In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE,RandomOverSampler
import pickle
from nltk.tokenize import WordPunctTokenizer
from sklearn.utils import resample

In [36]:
data = './lang_data.csv'
df = pd.read_csv(data, encoding = 'ISO-8859-1')

In [37]:
print(df)

                                                   text   language
0                        Ship shape and Bristol fashion    English
1                                        Know the ropes    English
2                                       Graveyard shift    English
3                                Milk of human kindness    English
4                    Touch with a barge-pole - Wouldn't    English
...                                                 ...        ...
2834                        Daarâs ân geurtjie aan.  Afrikaans
2835  Men's evil manners live in brass; their virtue...    English
2836                                          Go-faster    English
2837                                           Red tape    English
2838                                        In a pickle    English

[2839 rows x 2 columns]


In [38]:
#Resample data
df=resample(df, random_state=0)
df

Unnamed: 0,text,language
2732,A priori,English
2607,Great minds think alike,English
1653,"Zoals de waard is, vertrouwt hij zijn gasten.",Nederlands
835,Bag and baggage,English
763,Below the belt,English
...,...,...
508,Fall off the back of a lorry,English
2208,Geld groei nie op my rug nie.,Afrikaans
2273,De puntjes op de i zetten.,Nederlands
1446,Chickens come home to roost,English


In [39]:
# Checking the distribution of classes
df['language'].value_counts()

English       2007
Afrikaans      710
Nederlands     122
Name: language, dtype: int64

In [40]:
#Checking null entries
df.isnull().sum(axis = 0)

text        94
language     0
dtype: int64

In [41]:
# Checking null entries
df['text'].isnull().sum()

94

In [42]:
#Checking null entries
df['language'].isnull().sum()

0

In [43]:
# Remove any null entries
df.dropna(inplace=True)

In [44]:
df['text'].isnull().sum()

0

In [45]:
#Checking the distribution of classes after removing the null entries
df['language'].value_counts()

English       1984
Afrikaans      679
Nederlands      82
Name: language, dtype: int64

## Cleaning function
- First tokenize the words in a sentence, then return tokens that are alphabets.
- Join the tokens back into a sentence, and convert the words into lowercases.

In [46]:
tok = WordPunctTokenizer()
def document_cleaner(text):
    words = [x for x in  tok.tokenize(text) if x.isalpha()]
    words = " ".join(words)
    return(words.lower())

In [47]:
# Defining the model 
lsvc = LinearSVC(C=0.1)
count_vect = CountVectorizer(ngram_range=(1,1))#, encoding='latin-1')    

## Main function to run the for training the model

In [48]:
def main(df, p):
    df['text'] = df['text'].apply(document_cleaner)
    """ random oversampling technique to increase the number of minority classes/labels"""
    ros = RandomOverSampler(random_state=777)
    """Split the data into 30% for training and 70% for testing """
    x_train, x_test, y_train, y_test = train_test_split(df['text'], df['language'], test_size=0.7, random_state=2000)
  
    x_train = count_vect.fit_transform(x_train)
    """Calculate Tfidf for Bi-grams"""
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2))
    vectorize = TfidfTransformer()
    x_train = vectorize.fit_transform(x_train)
    X_ROS, y_ROS = ros.fit_sample(x_train, y_train)
    
    #fit the model
    lsvc.fit(X_ROS, y_ROS)
    
    #prediction
    prediction_1 = lsvc.predict(count_vect.transform(df['text']))
    prediction_2 = lsvc.predict(count_vect.transform(x_test))
    
    print("LinearSVC Model: Bigram- Scores for the whole dataset")
    print(classification_report(prediction_1, df['language']))
    print("LinearSVC Model: Bigram- Scores for the test dataset")
    print('Accuracy for the tese dataset:',accuracy_score(y_test, prediction_2))
    print('Classification report:''\n',classification_report(y_test, prediction_2))
    print('Confusion matrix for the test dataset: ''\n', confusion_matrix(y_test, prediction_2))
    

In [49]:
print(main(df, 35))

LinearSVC Model: Bigram- Scores for the whole dataset
              precision    recall  f1-score   support

   Afrikaans       0.98      0.94      0.96       702
     English       0.98      0.99      0.99      1962
  Nederlands       0.94      0.95      0.94        81

    accuracy                           0.98      2745
   macro avg       0.97      0.96      0.96      2745
weighted avg       0.98      0.98      0.98      2745

LinearSVC Model: Bigram- Scores for the test dataset
Accuracy for the tese dataset: 0.9745057232049948
Classification report:
               precision    recall  f1-score   support

   Afrikaans       0.94      0.97      0.96       477
     English       0.99      0.98      0.98      1391
  Nederlands       0.92      0.91      0.92        54

    accuracy                           0.97      1922
   macro avg       0.95      0.95      0.95      1922
weighted avg       0.97      0.97      0.97      1922

Confusion matrix for the test dataset: 
 [[ 462   14    1

In [50]:
#Save model and the count_vect array (matrix)
pickle.dump(lsvc, open('model_1.pkl','wb'))
pickle.dump(count_vect, open('model_2.pkl', 'wb'))

In [51]:
lsvc = pickle.load(open('model_1.pkl','rb'))
count_vect = pickle.load(open('model_2.pkl', 'rb'))

In [52]:
df['text']

2732                                       a priori
2607                        great minds think alike
1653    zoals de waard is vertrouwt hij zijn gasten
835                                 bag and baggage
763                                  below the belt
                           ...                     
508                    fall off the back of a lorry
2208                   geld groei nie op my rug nie
2273                      de puntjes op de i zetten
1446                    chickens come home to roost
2628                          christmas card verses
Name: text, Length: 2745, dtype: object