## Import the libraries used in this notebook

In [None]:
#import libraries
import pandas as pd 
import numpy as np
import re 

import matplotlib.pyplot as plt 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics import log_loss

from string import punctuation
import warnings 
# Install a blanket "ignore" filter so Python warnings do not clutter cell output.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn.
warnings.filterwarnings("ignore")
# Seed NumPy's global random generator with a fixed value so NumPy-driven
# randomness is repeatable between runs.
np.random.seed(123)

## Load the datasets (train, test, and sample submission)

In [None]:
# Load the competition files from the current working directory.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0
# (it now raises TypeError). `on_bad_lines='warn'` is the replacement that keeps
# the old behaviour: malformed rows are skipped and a warning is issued for each.
train = pd.read_csv('train.csv', on_bad_lines='warn')
test = pd.read_csv('test.csv')
# The sample submission supplies the `test_id` column used when writing predictions.
sub = pd.read_csv('sample_submission.csv')

### shape of the datasets

In [None]:
# Show (rows, columns) for the training and test frames, one per line.
print(train.shape, test.shape, sep="\n")


## Data Preprocessing
### mapping categorical values

In [None]:
# Data preprocessing: encode the six news categories as integer class ids 0-5.
# The position of each label in the tuple is its class id.
category_mapping = dict(
    zip(
        ("kitaifa", "michezo", "burudani", "uchumi", "kimataifa", "afya"),
        range(6),
    )
)

# Replace the text label with its class id. Series.map yields NaN for any value
# that is not a key of the dictionary.
train["category"] = train["category"].map(category_mapping)

# Preview the first rows with the encoded target.
train.head()

In [None]:
# Horizontal bar chart of how many training rows fall in each encoded category.
train["category"].value_counts().plot(kind="barh")

### List of stopwords in Swahili

In [None]:
# Swahili stopwords used to filter tokens before vectorisation.
# The literal merges several source lists, so it contains repeats; dict.fromkeys
# drops them while keeping first-seen order, and the result is still a plain list.
# Two missing commas in the original literal ("ingawa" "ingawaje" and "juu""chini")
# made Python concatenate the neighbouring strings into the bogus entries
# "ingawaingawaje" and "juuchini"; they are separate entries here.
sw_stopwords = list(dict.fromkeys([
    "akasema","alikuwa","alisema","baada","basi","bila","cha","chini","hadi",
    "hapo","hata","hivyo","hiyo","huku","huo","ili","ilikuwa","juu","kama","karibu",
    "katika","kila","kima","kisha","kubwa","kutoka","kuwa","kwa","kwamba","kwenda","kwenye","la","lakini",
    "mara","mdogo","mimi","mkubwa","mmoja","moja","muda","mwenye","na","naye","ndani","ng","ni","nini",
    "nonkungu","pamoja","pia","sana","sasa","sauti","tafadhali","tena","tu","vile","wa",
    "wakati","wake","walikuwa","wao","watu","wengine","wote","ya","yake","yangu","yao","yeye","yule","za",
    "zaidi","zake","na","ya","wa","kwa","ni","za","katika","la","kuwa","kama","kwamba","cha","hiyo","lakini","yake","hata","wakati",
    "hivyo","sasa","wake","au","watu","hii","zaidi","vya","huo","tu","kwenye","si","pia","ili","moja","kila","baada","ambao","ambayo","yao","wao","kuna",
    "hilo","kutoka","kubwa","pamoja","bila","huu","hayo","sana","ndani","mkuu","hizo","kufanya","wengi","hadi","mmoja","hili","juu","kwanza","wetu","kuhusu",
    "baadhi","wote","yetu","hivi","kweli","mara","wengine","nini","ndiyo","zao","kati","hao","hapa","kutokana","muda","habari","ambaye","wenye","nyingine","hakuna",
    "tena","hatua","bado","nafasi","basi","kabisa","hicho","nje","huyo","vile","yote","mkubwa","alikuwa","zote","leo","haya","huko","kutoa","mwa","kiasi","hasa","nyingi","kabla","wale","chini","gani","hapo","lazima","mwingine","bali","huku","zake","ilikuwa",
    "tofauti","kupata","mbalimbali","pale","kusema","badala","wazi","yeye","alisema","hawa",
    "ndio","hizi","tayari","wala","muhimu","ile","mpya","ambazo","dhidi","kwenda","sisi","kwani",
    "jinsi","binafsi","kutumia","mbili","mbali","kuu","mengine","mbele","namna","mengi","upande","na","lakini","ingawa",
    "ingawaje","kwa","sababu","hadi","hata","kama","ambapo","ambamo","ambako","ambacho","ambao","ambaye","ilhali","ya","yake","yao","yangu","yetu","yenu","vya","vyao","vyake","vyangu",
    "vyenu","vyetu","yako","yao","hizo","yenu","mimi","sisi","wewe","nyinyi","yeye","wao","nao","nasi","nanyi","ni","alikuwa","atakuwa","hii","hizi","zile",
    "ile","hivi","vile","za","zake","zao","zenu","kwenye","katika","kwa","kwao","kwenu","kwetu","dhidi","kati","miongoni","katikati","wakati","kabla","baada",
    "baadaye","nje","tena","mbali","halafu","hapa","pale","mara","mara","yoyote","wowote","chochote","vyovyote","yeyote","lolote","mwenye","mwenyewe","lenyewe",
    "lenye","wote","lote","vyote","nyote","kila","zaidi","hapana","ndiyo","au","ama","ama","sio","siye","tu","budi","nyingi","nyingine","wengine","mwingine",
    "zingine","lingine","kingine","chote","sasa","basi","bila","cha","chini","hapo","pale","huku","kule","humu","hivyo","hivyohivyo","vivyo","palepale","fauka",
    "hiyo","hiyohiyo","zile","zilezile","hao","haohao","huku","hukuhuku","humuhumu","huko","hukohuko","huo","huohuo","hili","hilihili","ilikuwa","juu","karibu",
    "kila","kima","kisha","kutoka","kwenda","kubwa","ndogo","kwamba","kuwa","la","lao","lo","mara","na",
    "mdogo","mkubwa","ng’o","pia","aidha","vile","vilevile","kadhalika","halikadhalika","ni","sana","pamoja","pamoja","tafadhali","tena",
    "wa","wake","wao",
    "ya","yule","wale","zangu","nje","afanaleki","salale","oyee","yupi","ipi","lipi","ngapi","yetu","si","angali","wangali","loo","la","ohoo",
    "barabara","oyee",
    "ewaa","walahi","masalale","duu","toba","mh","kumbe","ala","ebo","haraka","pole","polepole","harakaharaka","hiyo","hivyo","vyovyote",
    "atakuwa","itakuwa","mtakuwa",
    "tutakuwa","labda","yumkini","haiyumkini","yapata","takribani","hususani","yawezekana","nani","juu","chini",
    "ndani","baadhi","kuliko","vile","mwa","kwa","hasha","hivyo","moja","kisha",
    "pili","kwanza","ili","je","jinsi","ila","ila","nini","hasa","huu","zako","mimi",
]))

## import nltk for Natural Language Processing

In [None]:
import nltk
from nltk.tokenize import word_tokenize 

In [None]:
# Keep independent copies of both frames as they are before the cleaning cells
# overwrite the `content` column.
trainc, testc = train.copy(), test.copy()

### clean our dataset

In [None]:
import string

def clean_text(text):
    '''Normalise one raw article string before tokenisation.

    Steps, in order: lowercase; drop text in square brackets; drop URLs
    (http(s)://... and www....); drop HTML-like tags; drop ASCII punctuation;
    turn newlines into spaces; drop every word that contains a digit.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN pandas uses
        for an empty CSV cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removals are not collapsed.
    '''
    # Missing cells arrive as float NaN, which has no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Raw-string patterns avoid the invalid-escape warnings that '\[', '\S',
    # '\w' and '\d' trigger in ordinary string literals on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space (not the empty string) so the last word of one
    # line is not glued to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


# Clean the `content` column of the training and test frames with clean_text,
# overwriting the column in place.
train['content'] = train['content'].apply(clean_text)
test['content'] = test['content'].apply(clean_text)

# Preview the cleaned training text.
train['content'].head()

### Tokenize our data

In [None]:
# Split every document into a list of tokens; the pattern r'\w+' keeps runs of
# word characters and discards everything between them.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
train['content'] = train['content'].apply(tokenizer.tokenize)
test['content'] = test['content'].apply(tokenizer.tokenize)

# Preview the tokenised training documents.
train['content'].head()

### Remove Stopwords in Dataset

In [None]:
def remove_stopwords(text, stopwords=None):
    """
    Remove stopwords from a tokenised document.

    Parameters
    ----------
    text : iterable of str
        The tokens of one document.
    stopwords : iterable of str, optional
        Words to drop. Defaults to the notebook-level Swahili list `sw_stopwords`.

    Returns
    -------
    list of str
        The tokens that are not stopwords, in their original order.
    """
    if stopwords is None:
        stopwords = sw_stopwords
    # Set membership is a constant-time lookup; testing against the list directly
    # scans the whole list once per token.
    stopword_set = frozenset(stopwords)
    words = [w for w in text if w not in stopword_set]
    return words

# Filter the Swahili stopwords out of every token list in both frames.
train['content'] = train['content'].apply(remove_stopwords)
test['content'] = test['content'].apply(remove_stopwords)

# Preview the training frame after stopword removal.
train.head()

In [None]:
# After preprocessing, the text format
def combine_text(list_of_text):
    '''Join a list of tokens back into one string, separated by single spaces.'''
    return ' '.join(list_of_text)
# Turn each token list back into a single string, the input the vectorizer expects.
train['content'] = train['content'].apply(combine_text)
test['content'] = test['content'].apply(combine_text)

### TFIDF Vectorizer

In [None]:
# TF-IDF over word unigrams and bigrams with sub-linear term-frequency scaling
# and L2-normalised rows; min_df / max_df bound the document frequency of kept terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=15,
    max_df=0.5,
    sublinear_tf=True,
    norm='l2',
)
# Fit on the training text only, then reuse the fitted vectorizer for the test text.
train_vectors = tfidf.fit_transform(train['content'])
test_vectors = tfidf.transform(test['content'])
#10

## Train our model and evaluate the log loss

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV


### Split the training data into train and validation sets

In [None]:
#split our data into train and test
from sklearn.model_selection import train_test_split

# Features are the TF-IDF matrix of the training text; the target is the
# integer-encoded category column as a NumPy array.
X, y = train_vectors, train['category'].values

In [None]:
#from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()
#X_norm = scaler.fit_transform(X.toarray())
#X_test_norm = scaler.transform(test_vectors.toarray())

In [None]:
# split data into train and validate

# Hold out 20% of the rows for validation; the split is shuffled, stratified on
# the target, and uses a fixed random_state.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=42, shuffle=True, stratify=y
)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split
# (fit returns the fitted estimator, so creation and training are chained).
news_classifier = MultinomialNB().fit(X_train, y_train)

# Class probabilities for the validation rows.
y_probas = news_classifier.predict_proba(X_valid)

# Validation log loss; as the last expression it is shown as the cell output.
log_loss(y_valid, y_probas)

In [None]:
# Predict class probabilities for the test TF-IDF matrix with the Naive Bayes model.
# NOTE(review): `test_probas` is not referenced again in the visible notebook; the
# submission file is built from the logistic-regression probabilities instead.
test_probas = news_classifier.predict_proba(test_vectors)

In [None]:
# Random forest with 150 trees of depth at most 6 and a fixed random_state.
rfr = RandomForestClassifier(
    n_estimators=150,
    max_depth=6,
    random_state=0,
).fit(X_train, y_train)

# Validation class probabilities and their log loss (displayed as the cell output).
rfr_probas = rfr.predict_proba(X_valid)
log_loss(y_valid, rfr_probas)

In [None]:
# Gradient-boosted trees with the library's default hyper-parameters.
xgb = XGBClassifier().fit(X_train, y_train)

# Validation class probabilities and their log loss (displayed as the cell output).
xgb_probas = xgb.predict_proba(X_valid)
log_loss(y_valid, xgb_probas)

In [None]:
# Logistic regression with C=7, at most 150 solver iterations, and a fixed random_state.
logr = LogisticRegression(
    C=7,
    max_iter=150,
    random_state=23,
).fit(X_train, y_train)

# Validation class probabilities and their log loss (displayed as the cell output).
logr_probas = logr.predict_proba(X_valid)
log_loss(y_valid, logr_probas)

In [None]:
# Predict class probabilities for the test TF-IDF matrix with the logistic
# regression model; these probabilities are what the submission file is built from.
logr_probass =logr.predict_proba(test_vectors)


### create a submission file 

In [None]:
# Probability column names, listed in the order of the encoded class ids 0-5
# from category_mapping.
# NOTE(review): assumes predict_proba returns its columns in ascending class-id order.
submission_cols = ['kitaifa', 'michezo', 'burudani', 'uchumi', 'kimataifa', 'afya']

# Wrap the logistic-regression test probabilities in a frame and attach the ids
# taken from the sample submission.
submission_df = pd.DataFrame(logr_probass, columns=submission_cols).assign(
    test_id=sub['test_id']
)

# Put test_id first, followed by the six probability columns.
submission_df = submission_df[['test_id'] + submission_cols]

# Write the submission CSV without the index column.
submission_df.to_csv("first_submission.csv", index=False)