In [175]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from sklearn.utils import shuffle
import pickle
import nltk
from nltk import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from sklearn.utils import resample

In [176]:
data = './lang_data.csv'
df = pd.read_csv(data, encoding = 'ISO-8859-1')

In [177]:
print(df)

                                                   text    language
0                        Ship shape and Bristol fashion     English
1                                        Know the ropes     English
2                                       Graveyard shift     English
3                                Milk of human kindness     English
4                    Touch with a barge-pole - Wouldn't     English
5                    Sy kan altyd my battery natpiepie.   Afrikaans
6                            When the shit hits the fan     English
7                                                   NaN   Afrikaans
8                                                Egg on     English
9                                             Drag race     English
10                          As queer as a nine bob note     English
11                                                  NaN   Afrikaans
12                                     Run the gauntlet     English
13                                Raining cats a

In [178]:
df=resample(df, random_state=0)
df

Unnamed: 0,text,language
2732,A priori,English
2607,Great minds think alike,English
1653,"Zoals de waard is, vertrouwt hij zijn gasten.",Nederlands
835,Bag and baggage,English
763,Below the belt,English
1731,The smallest room in the house,English
1033,Fools' gold,English
277,Crocodile tears,English
1778,Bald as a coot,English
1828,Bog standard,English


In [179]:
df['language'].value_counts()

English       2007
Afrikaans      710
Nederlands     122
Name: language, dtype: int64

In [180]:
df.isnull().sum(axis = 0)

text        94
language     0
dtype: int64

In [181]:
df['text'].isnull().sum()

94

In [182]:
df['language'].isnull().sum()

0

In [183]:
df.dropna(inplace=True)

In [184]:
df['text'].isnull().sum()

0

In [185]:
df['language'].value_counts()

English       1984
Afrikaans      679
Nederlands      82
Name: language, dtype: int64

## Cleaning function

In [186]:
tok = WordPunctTokenizer()
def document_cleaner(text):
    words = [x for x in  tok.tokenize(text) if x.isalpha()]
    words = " ".join(words)
    return(words.lower())

In [187]:
lsvc = LinearSVC(C=0.1)
count_vect = CountVectorizer(ngram_range=(1,1))#, encoding='latin-1')    

## Main function to run the for training of the model

In [188]:
def main(df, p):
    df['text'] = df['text'].apply(document_cleaner)
    ros = RandomOverSampler(random_state=777)
    x_train, x_test, y_train, y_test = train_test_split(df['text'], df['language'], test_size=0.7, random_state=2000)
  
    x_train = count_vect.fit_transform(x_train)
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2))
    vectorize = TfidfTransformer()
    x_train = vectorize.fit_transform(x_train)
    X_ROS, y_ROS = ros.fit_sample(x_train, y_train)
    
    #fit the model
    lsvc.fit(X_ROS, y_ROS)
    
    #prediction
    prediction_1 = lsvc.predict(count_vect.transform(df['text']))
    prediction_2 = lsvc.predict(count_vect.transform(x_test))
    
    #print("LinearSVC Model: 2gram")
    print("LinearSVC Model: Bigram- Scores for the whole dataset")
    print(classification_report(prediction_1, df['language']))
    print("LinearSVC Model: Bigram- Scores for the test dataset")
    print('Accuracy for the tese dataset:',accuracy_score(y_test, prediction_2))
    print('Classification report:''\n',classification_report(y_test, prediction_2))
    print('Confusion matrix for the test dataset: ''\n', confusion_matrix(y_test, prediction_2))
    

In [189]:
print(main(df, 35))

LinearSVC Model: Bigram- Scores for the whole dataset
              precision    recall  f1-score   support

   Afrikaans       0.98      0.94      0.96       702
     English       0.98      0.99      0.99      1962
  Nederlands       0.94      0.95      0.94        81

    accuracy                           0.98      2745
   macro avg       0.97      0.96      0.96      2745
weighted avg       0.98      0.98      0.98      2745

LinearSVC Model: Bigram- Scores for the test dataset
Accuracy for the tese dataset: 0.9745057232049948
Classification report:
               precision    recall  f1-score   support

   Afrikaans       0.94      0.97      0.96       477
     English       0.99      0.98      0.98      1391
  Nederlands       0.92      0.91      0.92        54

    accuracy                           0.97      1922
   macro avg       0.95      0.95      0.95      1922
weighted avg       0.97      0.97      0.97      1922

Confusion matrix for the test dataset: 
 [[ 462   14    1

In [99]:
pickle.dump(lsvc, open('model_1.pkl','wb'))
pickle.dump(count_vect, open('model_2.pkl', 'wb'))

In [100]:
lsvc = pickle.load(open('model_1.pkl','rb'))
count_vect = pickle.load(open('model_2.pkl', 'rb'))

In [101]:
df['text']

444                     hy kedoem sy kitaar so nou en dan
851                                           grond vreet
751                                   a fish out of water
277                                       crocodile tears
988                                          stand up guy
1960    that s one small step for a man a giant leap f...
135                       soebat en bangoog kyk help niks
1062                                        saartjie soen
1441                    haar haan moet altyd koning kraai
2533                            a watched pot never boils
1734    mens moet nie werk om te lewe nie â hy moet le...
577                        scuse me while i kiss this guy
1384                                     cloak and dagger
1967                                        loaf of bread
1115                 daai manne by die spele ry toe speek
1434                       heard it through the grapevine
2752                           whole kit and caboodle the
1220          