In [None]:
# Importing comet_ml
import comet_ml
from comet_ml import Experiment
#from comet_ml.integration.pytorch import log_model

# Creating a Comet experiment.
# The API key is a secret and must never be stored in the notebook itself:
# it is read from the COMET_API_KEY environment variable instead. When the
# variable is unset, None is passed and Comet falls back to its own
# configuration lookup (e.g. a .comet.config file).
# NOTE(review): the key that was previously committed here should be revoked.
import os

experiment = Experiment(
  api_key = os.environ.get("COMET_API_KEY"),
  project_name = "Language Classification",
  workspace="sorach-roshe"
)
# suppress cell warnings
# The "ignore" filter is installed without a category, so it hides every
# Python warning raised from this point on.
import warnings
warnings.filterwarnings("ignore")



In [None]:
# utilities
# Importing the Libraries 
import re  # for pattern matching and manipulation of strings.
import numpy as np  # used for numerical computations and data manipulation.
import pandas as pd  # for data manipulation and analysis.
import string  # for string manipulation tasks.

# For plotting
import seaborn as sns  # for data visualization.
from wordcloud import WordCloud  # used to generate word clouds.
import matplotlib.pyplot as plt  # used for plotting data.

# nltk
from nltk.stem import WordNetLemmatizer  # used for lemmatizing words
from nltk.tokenize import TreebankWordTokenizer  # used for tokenizing sentences into words.
from nltk import SnowballStemmer  # used for stemming words.

# sklearn
from sklearn.svm import LinearSVC  # used for solving linear classification problems. 
from sklearn.naive_bayes import BernoulliNB  # implementation of the Naive Bayes algorithm.
from sklearn.linear_model import LogisticRegression  # implementation of logistic regression.  
from sklearn.model_selection import train_test_split  # for splitting a dataset into training and testing subsets.
from sklearn.feature_extraction.text import TfidfVectorizer  # for converting text documents into a numerical representation.
from sklearn.metrics import confusion_matrix, classification_report  # for evaluating the performance of a classification model
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn import preprocessing  # for data preprocessing
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
import timeit
# suppress cell warnings
# Installs a blanket "ignore" filter so warnings raised by later cells
# are not shown in the cell output.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training data and the test data from CSV files in the working directory.
df = pd.read_csv("train_set.csv") # reads "train_set.csv" into the DataFrame df.
df_test = pd.read_csv("test_set.csv") # reads "test_set.csv" into the DataFrame df_test.

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# Preview the first 20 rows of the test data.
df_test.head(20)

In [None]:
# Summary of the training frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Summary of the test frame: columns, non-null counts and dtypes.
df_test.info()

In [None]:
# Number of training rows that contain at least one missing value.
df.isnull().any(axis=1).sum()

In [None]:
# Boolean Series marking training rows that repeat an earlier row.
df.duplicated() 

In [None]:
# Display the training data with duplicate rows removed.
# NOTE(review): the result is not assigned, so df itself still contains any
# duplicates — confirm whether `df = df.drop_duplicates()` was intended.
df.drop_duplicates()

In [None]:
# Data type of each column in the training data.
df.dtypes

In [None]:
# Descriptive statistics for the training data.
df.describe()

In [None]:
# Report the (rows, columns) shape of the train and test frames.
train_shape, test_shape = df.shape, df_test.shape
print(f'The shape of the train dataset: {train_shape}\nThe shape of the test dataset: {test_shape}')

In [None]:
# Distinct language labels present in the training data.
df['lang_id'].unique()

In [None]:
# Number of training rows per language label.
df['lang_id'].value_counts()

In [None]:
# Create a count plot of the number of training rows per language label.
sns.countplot(x='lang_id', data=df)

In [None]:
# Plotting the distribution for dataset.
# groupby() orders the bars by sorted lang_id, so the tick labels pandas
# derives from the group keys are already correct. Overriding them with a
# hand-written list in a different order would mislabel the bars; rot=0 is
# enough to keep the labels horizontal.
ax = df.groupby('lang_id').count().plot(kind='bar', title='Distribution of data', legend=False, rot=0)
# Storing data in lists.
text, lang_id = list(df['text']), list(df['lang_id'])

In [None]:
 #Separating languages by number
# One frame per language: data_1 ... data_11 hold the rows whose lang_id
# equals the code at the same position in language_codes.
language_codes = ['xho', 'eng', 'nso', 'ven', 'tsn', 'nbl', 'zul', 'ssw', 'tso', 'sot', 'afr']
(data_1, data_2, data_3, data_4, data_5, data_6,
 data_7, data_8, data_9, data_10, data_11) = [
    df[df['lang_id'] == code] for code in language_codes
]

In [None]:
# Stack the per-language frames back into a single frame, in language order.
language_frames = [
    data_1, data_2, data_3, data_4, data_5, data_6,
    data_7, data_8, data_9, data_10, data_11,
]
dataset = pd.concat(language_frames)

In [None]:
# Lower-case the text column of both the training and the test frame,
# then preview the last rows of the training text.
for frame in (dataset, df_test):
    frame['text'] = frame['text'].str.lower()
dataset['text'].tail()

In [None]:
# remove punctuation
def remove_punctuation(post):
    """Return `post` with every character found in string.punctuation removed."""
    kept_chars = []
    for char in post:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [None]:
# Strip punctuation from every text entry in the training and test frames.
dataset['text'] = dataset['text'].map(remove_punctuation)
df_test['text'] = df_test['text'].map(remove_punctuation)

In [None]:
# Tokenise
# Adds a 'tokens' column holding the Treebank tokenisation of each text.
# NOTE(review): the visible cells build the model features from the 'text'
# column, not from 'tokens' — confirm whether this column is still needed.
tokeniser = TreebankWordTokenizer()
dataset['tokens'] = dataset['text'].apply(tokeniser.tokenize)

In [None]:
# Lemmatization
# NOTE(review): the lemmatizer is only instantiated here; no visible cell
# applies it to the data — confirm whether a lemmatization step is missing.
lemmatizer = WordNetLemmatizer()

In [None]:
# Input feature (the cleaned text) and target label (the language id).
X, y = dataset['text'], dataset['lang_id']

In [None]:
# Separating the 80% data for training data and 20% for testing data
# random_state is fixed so the split is the same on every run.
# X_test / y_test are the held-out part of the labelled data used for
# evaluation; they are distinct from df_test loaded from "test_set.csv".
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

In [None]:
# Transforming the Dataset Using TF-IDF Vectorizer
# Configured with n-grams of length 1 to 3 and capped at 500000 features.
# The vectoriser is fitted on the training split only.
vectoriser = TfidfVectorizer(ngram_range=(1,3), max_features=500000)
vectoriser.fit(X_train)

In [None]:
# Turn the training and held-out texts into TF-IDF features with the fitted vectoriser.
X_train_vect = vectoriser.transform(X_train)
X_test_vect = vectoriser.transform(X_test)

In [None]:
# TF-IDF features for the texts in df_test, using the same fitted vectoriser.
test_X = vectoriser.transform(df_test.text)

In [None]:
def model_Evaluate(model, X=None, y=None):
    """Print a classification report for a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix to predict on. Defaults to the notebook-level
        ``X_test_vect`` when omitted.
    y : true labels matching ``X``. Defaults to the notebook-level
        ``y_test`` when omitted.

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is supplied, since the report would
        then compare predictions and labels from different data sets.
    """
    if (X is None) != (y is None):
        raise ValueError("X and y must be provided together or both omitted")
    if X is None:
        # Fall back to the held-out split prepared earlier in the notebook.
        X, y = X_test_vect, y_test
    # Predict values for the evaluation dataset
    y_pred = model.predict(X)
    # Print the evaluation metrics for the dataset.
    print(classification_report(y, y_pred))

In [None]:
# Bernoulli model
BNBmodel = BernoulliNB()
BNBmodel.fit(X_train_vect, y_train)
# Report metrics on the held-out split of the labelled data.
model_Evaluate(BNBmodel)
# Predictions for the results file must come from the df_test features
# (test_X), not from the held-out split (X_test_vect): the results frame
# pairs them with df_test['index'], so they have to describe the same rows.
y_pred1 = BNBmodel.predict(test_X)

In [None]:
# Keep at most len(y_pred1) rows of df_test (a no-op when df_test is not
# longer than y_pred1). This rebinds df_test; test_X computed earlier is
# not re-sliced.
df_test = df_test.iloc[:len(y_pred1)]

In [None]:
# Keep at most len(df_test) predictions so y_pred1 is no longer than df_test.
y_pred1 = y_pred1[:len(df_test)]

In [None]:
# Results frame for the Bernoulli model: the 'index' column of df_test next
# to the predicted language, saved to CSV without the DataFrame index and
# previewed.
results_df_bern = df_test[['index']].assign(lang_id=y_pred1)
results_df_bern.to_csv('results_df_bern.csv', index=False)
results_df_bern.head()

In [None]:
# Summary of the Bernoulli results frame: columns, non-null counts and dtypes.
results_df_bern.info()

In [None]:
# SVC model
SVCmodel = LinearSVC()
SVCmodel.fit(X_train_vect, y_train)
# Report metrics on the held-out split of the labelled data.
model_Evaluate(SVCmodel)
# Predictions for the results file must come from the df_test features
# (test_X), not from the held-out split (X_test_vect): the results frame
# pairs them with df_test['index'], so they have to describe the same rows.
y_pred2 = SVCmodel.predict(test_X)

In [None]:
# Keep at most len(df_test) SVC predictions. The slice is taken from y_pred2
# itself (not y_pred1) so the SVC results file contains the SVC model's
# predictions rather than the Bernoulli model's.
y_pred2 = y_pred2[:len(df_test)]

In [None]:
# Results frame for the SVC model: the 'index' column of df_test next to the
# predicted language, saved to CSV without the DataFrame index and previewed.
results_df_svc = df_test[['index']].assign(lang_id=y_pred2)
results_df_svc.to_csv('results_df_svc.csv', index=False)
results_df_svc.head()

In [None]:
# End the Comet experiment created in the first cell.
experiment.end()