In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score
import re
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# Import warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
# Read the training and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_path)
               for csv_path in ('train_set.csv', 'test_set.csv'))

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Number of training rows per `lang_id` value (the target used for modelling below).
train['lang_id'].value_counts()

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory use.
train.info()

In [None]:
# Display names of the baseline models. The order matters: the training loop
# pairs these with the entries of `classifiers` position by position via zip().
names = [
    'Logistic Regression',
    'Random Forest',
    'Nearest Neighbors',
    'Decision Tree',
    'MultinomialNB',
    'Linear SVC',
    'XG Boost',
]

In [None]:
def _tfidf_pipeline(estimator):
    """Return a text-classification pipeline: shared TF-IDF features -> `estimator`.

    Every baseline uses the same vectoriser settings, so they are defined once
    here instead of being repeated for each model. A new TfidfVectorizer is
    created on every call, so no fitted state is shared between pipelines.

    Parameters
    ----------
    estimator : scikit-learn compatible classifier
        Unfitted classifier placed in the final 'clf' step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps 'tfid' (TfidfVectorizer) and 'clf' (estimator).
    """
    # NOTE(review): stop_words='english' drops English function words; if
    # `lang_id` is a language label those words may be informative - confirm.
    return Pipeline([('tfid', TfidfVectorizer(max_df = 0.4,
                                              smooth_idf = True,
                                              stop_words = 'english',
                                              ngram_range = (1,2))),
                     ('clf', estimator)])


# One pipeline per baseline model, in the same order as the display names
# (the two lists are paired position by position when the models are fitted).
# NOTE(review): some xgboost releases reject non-integer class labels; confirm
# XGBClassifier accepts the `lang_id` values as they are.
classifiers = [
    _tfidf_pipeline(estimator)
    for estimator in (
        LogisticRegression(),
        RandomForestClassifier(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        MultinomialNB(),
        LinearSVC(),
        XGBClassifier(),
    )
]

In [None]:
# Features are the raw text, the target is `lang_id`; hold out 20% of the rows
# with a fixed seed so the split is the same on every run.
X, y = train['text'], train['lang_id']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 10,
)

In [None]:
results = []

models = {}
confusion = {}
class_report = {}


for name, clf in zip(names, classifiers):    
    print ('Fitting {:s} model...'.format(name))
    run_time = %timeit -q -o clf.fit(X_train, y_train)
    
    print ('... predicting')
    y_pred = clf.predict(X_train)   
    y_pred_test = clf.predict(X_test)
    
    models[name] = clf    
    
    results.append([name, run_time.best])

    
results = pd.DataFrame(results, columns=['Classifier', 'Train Time'])
results.set_index('Classifier', inplace= True)

In [None]:
def write_submission(model, path, texts=None, ids=None):
    """Predict `lang_id` with `model` and write the predictions to a CSV file.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``
        Called once on `texts`.
    path : str
        Destination CSV file; an existing file is overwritten.
    texts : optional
        Inputs passed to ``model.predict``. Defaults to ``test['text']``.
    ids : optional
        Values for the 'index' column. Defaults to ``test['index']``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written, with columns 'index' and 'lang_id'.
    """
    if texts is None:
        texts = test['text']
    if ids is None:
        ids = test['index']
    submission = pd.DataFrame(data = {'index': ids,
                                      'lang_id': model.predict(texts)})
    # quoting = 3 is csv.QUOTE_NONE: fields are written without quote characters.
    submission.to_csv(path, index = False, quoting = 3)
    return submission


# Baseline model name (key in `models`) -> submission file written for it.
# File names are unchanged. NOTE(review): 'submission_fr.csv' looks like a
# transposition of "rf" (Random Forest) - confirm before renaming it.
SUBMISSION_FILES = {
    'Logistic Regression': 'submission_lr2.csv',
    'Random Forest': 'submission_fr.csv',
    'Nearest Neighbors': 'submission_nn.csv',
    'Decision Tree': 'submission_dt.csv',
    'MultinomialNB': 'submission_m.csv',
    'Linear SVC': 'submission_l_svc.csv',
    'XG Boost': 'submission_xg.csv',
}

t = test['text']
test_predictions = {}  # baseline model name -> predicted lang_id values for the test set
for model_name, csv_path in SUBMISSION_FILES.items():
    sub = write_submission(models[model_name], csv_path, texts = t)
    test_predictions[model_name] = sub['lang_id'].to_numpy()

# Hyperparameter Tuning

In [None]:
#MultinomialNB Hyperparameter tuning
# Split the raw text BEFORE any vectorising so the held-out rows never
# influence the TF-IDF vocabulary or idf weights.
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(train['text'], y, test_size = 0.2, random_state = 10)

# Search over the whole TF-IDF + NB pipeline: GridSearchCV then refits the
# vectoriser inside every cross-validation fold, so validation folds do not
# leak into the features. 'clf__alpha' addresses `alpha` of the 'clf' step.
params = {'clf__alpha':[1,0.1,0.01,0.001,0.0001,0.00001]}

grid_MNB = GridSearchCV(Pipeline([('tfid', TfidfVectorizer()),
                                  ('clf', MultinomialNB())]),
                        params)
grid_MNB.fit(X_train_h, y_train_h)
print(grid_MNB.best_params_)
# Score the refitted best pipeline on the split that the search never saw.
print('Held-out score: {:.4f}'.format(grid_MNB.score(X_test_h, y_test_h)))

In [None]:
# Fit a TF-IDF + MultinomialNB pipeline on the training split and write its
# predictions for the test set.
# NOTE(review): alpha = 0.1 is hard-coded; presumably it was copied from the
# grid-search output - confirm it still matches the search result.
multi = Pipeline(steps = [
    ('tfid', TfidfVectorizer()),
    ('clf', MultinomialNB(alpha = 0.1)),
])
multi.fit(X_train, y_train)

t = test['text']
y_pred_m = multi.predict(t)

# quoting = 3 is csv.QUOTE_NONE; this overwrites any existing 'submission_m.csv'.
sub = pd.DataFrame({'index': test['index'], 'lang_id': y_pred_m})
sub.to_csv('submission_m.csv', index = False, quoting = 3)