#####  Sihle_Riti_Classification_Hack

1. Introduction
2. Import libraries and load data
3. Data pre-processing
4. Exploratory Data Analysis

### 1. Introduction

South Africa is a multicultural society that is characterised by its rich linguistic diversity. Language is an indispensable tool that can be used to deepen democracy and also contribute to the social, cultural, intellectual, economic and political life of the South African society.

### 2: Import libraries and load data

In [56]:
# Standard
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Natural language Processing
import nltk
import string
import re
from sklearn.utils import resample
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, StackingClassifier

# Performance
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn import metrics

In [57]:
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')
sample = pd.read_csv('sample_submission.csv')

In [58]:
train.head(11)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...


In [59]:
test.head(11)

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
5,6,"Ke feela dilense tše hlakilego, tša pono e tee..."
6,7,<fn>(762010101403 AM) 1495 Final Gems Birthing...
7,8,Ntjhafatso ya konteraka ya mosebetsi: Etsa bon...
8,9,u-GEMS uhlinzeka ngezinzuzo zemithi yezifo ezi...
9,10,"So, on occasion, are statistics misused."


In [60]:
sample.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


### 3. Preprocessing

In [63]:
def clean_text(text):

    # change all words into lower case
    text = text.lower()

    # removing all punctuation and digits
    text = re.sub(r'[-]',' ',text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub('[0-9]+', '', text)


    text = re.sub("â|ã", " ", text)  # removes strange character    
    text = re.sub("\\s+", " ", text)  # fills white spaces
    text = text.lstrip()  # removes whitespaces before string
    text = text.rstrip()  # removes whitespaces after string 
    return text


In [64]:
#Apply the clean function to our train and test data
train['clean_text']=train['text'].apply(clean_text)
test['clean_text']=test['text'].apply(clean_text)

In [65]:
train.head(11)

Unnamed: 0,lang_id,text,clean_text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulu natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...
5,nso,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...,dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6,tsn,kgetse nngwe le nngwe e e sa faposiwang mo tsh...,kgetse nngwe le nngwe e e sa faposiwang mo tsh...
7,ven,mbadelo dze dza laelwa dzi do kwama mahatulele...,mbadelo dze dza laelwa dzi do kwama mahatulele...
8,nso,maloko a dikhuduthamaga a ikarabela mongwe le ...,maloko a dikhuduthamaga a ikarabela mongwe le ...
9,tsn,fa le dirisiwa lebone le tshwanetse go bontsha...,fa le dirisiwa lebone le tshwanetse go bontsha...


In [66]:
test.head(11)

Unnamed: 0,index,text,clean_text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",mmasepala fa maemo a a kgethegileng a letlelel...
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fana na ngano dza vhathu
3,4,Kube inja nelikati betingevakala kutsi titsini...,kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.,winste op buitelandse valuta
5,6,"Ke feela dilense tše hlakilego, tša pono e tee...",ke feela dilense tše hlakilego tša pono e tee ...
6,7,<fn>(762010101403 AM) 1495 Final Gems Birthing...,fn am final gems birthing options_zulutxtfn
7,8,Ntjhafatso ya konteraka ya mosebetsi: Etsa bon...,ntjhafatso ya konteraka ya mosebetsi etsa bonn...
8,9,u-GEMS uhlinzeka ngezinzuzo zemithi yezifo ezi...,u gems uhlinzeka ngezinzuzo zemithi yezifo ezi...
9,10,"So, on occasion, are statistics misused.",so on occasion are statistics misused


In [67]:
train.lang_id.value_counts()

ven    3000
sot    3000
nbl    3000
tso    3000
nso    3000
afr    3000
ssw    3000
xho    3000
eng    3000
tsn    3000
zul    3000
Name: lang_id, dtype: int64

In [69]:
X = train['text']
y = train['lang_id'] 


X_train , X_test , y_train , y_test = train_test_split(X , y, test_size =0.10)

In [70]:
%%time
classifiers = [LinearSVC(random_state=42),
               LogisticRegression(random_state=42,
                                  multi_class='ovr',
                                  n_jobs=1,
                                  C=1e5,
                                  max_iter=4000),
               KNeighborsClassifier(n_neighbors=5),
               MultinomialNB(),
               SGDClassifier(loss='hinge',
                             penalty='l2',
                             alpha=1e-3,
                             random_state=42,
                             max_iter=5,
                             tol=None)
    ]


Wall time: 217 ms
Parser   : 119 ms


In [71]:
def model(classifiers, X_train, y_train, X_test,y_test):
    
    model_summary = {}
    
    # Pipeline to balance the classses and then to build the model
    for clf in classifiers:
        clf_text = Pipeline([('tfidf', TfidfVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 2))),
                             ('clf', clf)
                            ])

    # Execution Time Logging
    start_time = time.time()
    clf_text.fit(X_train, y_train)
    predictions = clf_text.predict(X_test)
    run_time = time.time()-start_time
      
    # Models Performance
    model_summary[clf.__class__.__name__] = {
        'F1-Macro':metrics.f1_score(y_test,predictions,average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test,predictions,average='micro'),
        'F1-Weighted':metrics.f1_score(y_test,predictions,average='weighted'),
        'Execution Time': run_time }
    
    return pd.DataFrame.from_dict(model_summary, orient='index')

In [72]:
%%time
clf_df = model(classifiers,X_train, y_train, X_test, y_test)
The_clf_df = clf_df.sort_values('F1-Weighted',ascending=False)
The_clf_df

Wall time: 7.99 s


Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted,Execution Time
SGDClassifier,0.988202,0.988182,0.988132,7.459208


In [80]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.02)

In [81]:
# Creating a pipeline for the gridsearch
param_grid = {'alpha': [0.1, 1, 5, 10]}  # setting parameter grid

Sgd = Pipeline([('tfidf', TfidfVectorizer(min_df=2,
                                                max_df=0.9,
                                                ngram_range=(1, 2))),
                      ('Sgd', GridSearchCV(LinearSVC(),
                                           param_grid=param_grid,
                                           cv=5,
                                           n_jobs=-1,
                                           scoring='f1_weighted'))
                      ])

.fit(X_train, y_train)  # Fitting the model

y_pred_Sgd = Sgd.predict(X_test)  # predicting the fit on validation set


print(classification_report(y_test, y_pred_Sgd))

ValueError: Invalid parameter alpha for estimator LinearSVC(). Check the list of available parameters with `estimator.get_params().keys()`.

In [82]:
submit_df = pd.DataFrame(test['index'])
submit_df['lang_id'] = Sgd.predict(test['text'])
submit_df.to_csv('submit_SGDClassifier.csv', index=False)