In [1]:

# Packages for data analysis
import pandas as pd
import numpy as np
import time

# Packages for visualizations
import seaborn as sns
import matplotlib.style as style

# Packages for preprocessing
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
# Model Evaluation Packages
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer
from sklearn.svm import LinearSVC

import matplotlib.pyplot as plt
%matplotlib inline

# Style
sns.set(font_scale=1.5)
style.use('seaborn-pastel')
style.use('seaborn-poster')

In [2]:
# Importing the dataset
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
train.describe

<bound method NDFrame.describe of       lang_id                                               text
0         xho  umgaqo-siseko wenza amalungiselelo kumaziko ax...
1         xho  i-dha iya kuba nobulumko bokubeka umsebenzi na...
2         eng  the province of kwazulu-natal department of tr...
3         nso  o netefatša gore o ba file dilo ka moka tše le...
4         ven  khomishini ya ndinganyiso ya mbeu yo ewa maana...
...       ...                                                ...
32995     tsn  popo ya dipolateforomo tse ke go tlisa boetele...
32996     sot  modise mosadi na o ntse o sa utlwe hore thaban...
32997     eng  closing date for the submission of completed t...
32998     xho  nawuphina umntu ofunyenwe enetyala phantsi kwa...
32999     sot  mafapha a mang le ona a lokela ho etsa ditlale...

[33000 rows x 2 columns]>

In [4]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [5]:
test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [6]:
sample_submission.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


In [28]:
train.shape

(33000, 2)

In [8]:
#removal of patterns
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
train['text'] = train['text'].replace(to_replace = pattern_url, value = subs_url, regex = True)
test['text'] = test['text'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [9]:
# Checking if character is white-space character or not
print(len(train['text']))
print(sum(train['text'].apply(lambda x: x.isspace())))

33000
0


In [10]:
# Checking if character is white-space character or not
print(len(test['text']))
print(sum(test['text'].apply(lambda x: x.isspace())))

5682
0


In [11]:
import string
def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

In [12]:
train['text'] = train['text'].apply(remove_punctuation)
train['text'].iloc[0]
test['text'] = test['text'].apply(remove_punctuation)
test['text'].iloc[0]

'Mmasepala fa maemo a a kgethegileng a letlelela kgato eo'

In [15]:
weird = ['tÃƒÂ¢Ã¢â€šÂ¬Ã‚Â¦ï¿½ï¿½ï¿½ï¿½ï™']
normal = 'abcdefghijklmnopqrstuvwxyz123456789 \n'
def remove_weird_chars(post):
    return ''.join([l for l in post if l in normal])

In [17]:
train['text'] = train['text'].apply(remove_weird_chars)
train['text'].iloc[0]
test['text'] = test['text'].apply(remove_weird_chars)
test['text'].iloc[0]

'masepala fa maemo a a kgethegileng a letlelela kgato eo'

In [19]:
X = train["text"]
y = train["lang_id"]

# Split dataset

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [21]:
X_train

12463    ge e le gore ga go le e tee ya te ta ka godimo...
9378     namhlanjesi sinehlelo lokubusa lentando yeneng...
9477     timhaka tinwana leti vekiweke kumbe iii mhaka ...
2142     laola tshebediso le maemo a mehlodi basebedisi...
23983    akukho mntu ofanelwe sisenzo somthetho okanye ...
                               ...                        
16850    lesi sifunda saziwa ngokwehlukahlukana kwezinh...
6265     lw die uitvoerpermit waarna verwys word sluit ...
11284    ka ntlha ya fa bosiamisi ba kagosea bo theetsw...
860      tsweletso go tlhalosa bokao le tiriso ya mafok...
15795    ka tsela e jwalo ba lelapa la bongi le metswal...
Name: text, Length: 29700, dtype: object

In [22]:
vectorise = TfidfVectorizer()
vectorise.fit_transform(X_train)

<29700x135246 sparse matrix of type '<class 'numpy.float64'>'
	with 831753 stored elements in Compressed Sparse Row format>

# Train model

In [26]:
svc_Model = LinearSVC()
svc_Model.fit(vectorise.transform(X_train), y_train)
y_pred = svc_Model.predict(vectorise.transform(X_test))
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       281
         eng       1.00      1.00      1.00       297
         nbl       1.00      0.99      1.00       327
         nso       1.00      0.99      1.00       322
         sot       1.00      1.00      1.00       307
         ssw       0.99      1.00      0.99       286
         tsn       1.00      1.00      1.00       297
         tso       1.00      1.00      1.00       253
         ven       1.00      1.00      1.00       322
         xho       0.99      1.00      1.00       313
         zul       0.99      0.99      0.99       295

    accuracy                           1.00      3300
   macro avg       1.00      1.00      1.00      3300
weighted avg       1.00      1.00      1.00      3300



In [32]:
predict1= svc_Model.predict(vectorise.transform(test['text']))

In [33]:
#Make Predictions
prediction= pd.DataFrame(test['index'], columns=['index'])
prediction.insert(1, 'lang_id', predict1, allow_duplicates=False)
prediction.to_csv('scvmodel.csv', index=False)

prediction.head()


Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
