In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder


In [8]:
# Read the training CSV first, then the test CSV, from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train_set.csv', 'test_set.csv')
)

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Tally how many training rows carry each `lang_id` label.
lang_id_column = train_df['lang_id']
language_counts = lang_id_column.value_counts()

# Show the per-language tallies.
print(language_counts)

In [9]:
def _clean_text(text):
    """Return a lowercased copy of `text` with punctuation and digits removed.

    Parameters
    ----------
    text : pandas.Series
        Series of strings to normalise.

    Returns
    -------
    pandas.Series
        New Series; the input is not modified.
    """
    return (
        text.str.lower()
        # `regex=True` is required: since pandas 2.0 `str.replace` treats the
        # pattern as a literal string by default, which would make these
        # two substitutions silent no-ops. Raw strings avoid the invalid
        # `\w` / `\d` escape-sequence warnings.
        .str.replace(r'[^\w\s]', '', regex=True)  # drop everything that is not a word char or whitespace
        .str.replace(r'\d+', '', regex=True)      # drop runs of digits
    )


# Apply the identical normalisation to both frames so that the features
# seen at prediction time match the ones seen during training.
train_df['text'] = _clean_text(train_df['text'])
test_df['text'] = _clean_text(test_df['text'])


In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df['lang_id'],
    test_size=0.2,
    random_state=42,
)

# Unigram + bigram TF-IDF features. The vectorizer is fitted on the training
# split only and then reused, unchanged, for the validation split.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Linear-kernel SVM; `fit` returns the estimator itself, so it can be chained.
svm_model = SVC(kernel='linear', C=1).fit(X_train_tfidf, y_train)

# Predict labels for the held-out rows.
y_pred = svm_model.predict(X_val_tfidf)

# Per-class report followed by the micro-averaged F1.
print("Classification Report:")
svm_report = classification_report(y_val, y_pred)
print(svm_report)
svm_micro_f1 = f1_score(y_val, y_pred, average='micro')
print("Mean F1-Score:", svm_micro_f1)


In [10]:
from sklearn.naive_bayes import MultinomialNB
# Features and labels. Defined here (rather than relying on a later cell)
# so that this cell runs on a fresh kernel executed top to bottom.
X = train_df['text']
y = train_df['lang_id']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer for unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit and transform the TF-IDF vectorizer on the training set
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the validation set using the same TF-IDF vectorizer
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Initialize and train the Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
y_pred_nb = nb_model.predict(X_val_tfidf)

# Evaluate the model: per-class report plus micro-averaged F1
print("Naive Bayes Classification Report:")
print(classification_report(y_val, y_pred_nb))
print("Mean F1-Score (Naive Bayes):", f1_score(y_val, y_pred_nb, average='micro'))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       0.99      1.00      1.00       615
         nbl       0.99      0.99      0.99       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       1.00      1.00      1.00       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       1.00      1.00      1.00       609
         zul       1.00      0.99      0.99       590

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600

Mean F1-Score (Naive Bayes): 0.9977272727272727


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Get the unique classes from the training set
unique_classes = y_train.unique()
n_classes = len(unique_classes)

# Define a range of hyperparameters to search
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
    'fit_prior': [True, False],
    # Exactly two candidates: None (no explicit prior passed) and ONE
    # uniform prior vector with an entry per class. The vector must be
    # a single nested list element; concatenating it onto the outer list would
    # instead produce `n_classes` scalar candidates, each an invalid prior.
    'class_prior': [None, [1 / n_classes] * n_classes],
}

# Initialize GridSearchCV with a fresh estimator. `nb_model` is deliberately
# NOT rebound here: GridSearchCV clones its estimator, so rebinding would
# leave `nb_model` unfitted and break the later cell that predicts with it.
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='f1_macro', n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train_tfidf, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Make predictions on the validation set using the best model
y_pred_nb = grid_search.best_estimator_.predict(X_val_tfidf)

# Evaluate the model (macro-averaged F1, matching the search's scoring metric)
print("Naive Bayes Classification Report:")
print(classification_report(y_val, y_pred_nb))
print("Mean F1-Score (Naive Bayes):", f1_score(y_val, y_pred_nb, average='macro'))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with 100 trees and a fixed seed, trained on the TF-IDF
# training matrix; `fit` returns the estimator, so construction and
# training are chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Predict labels for the held-out validation rows.
y_pred_rf = rf_model.predict(X_val_tfidf)

# Per-class report followed by the micro-averaged F1.
print("Random Forest Classification Report:")
rf_report = classification_report(y_val, y_pred_rf)
print(rf_report)
rf_micro_f1 = f1_score(y_val, y_pred_rf, average='micro')
print("Mean F1-Score (Random Forest):", rf_micro_f1)


In [None]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest Logistic Regression, capped at 100 solver iterations, with a
# fixed seed; trained on the TF-IDF training matrix.
lr_model = LogisticRegression(
    multi_class='ovr',
    max_iter=100,
    random_state=42,
)
lr_model.fit(X_train_tfidf, y_train)

# Predict labels for the held-out validation rows.
y_pred_lr = lr_model.predict(X_val_tfidf)

# Per-class report followed by the macro-averaged F1.
print("Logistic Regression Classification Report:")
lr_report = classification_report(y_val, y_pred_lr)
print(lr_report)
lr_macro_f1 = f1_score(y_val, y_pred_lr, average='macro')
print("Mean F1-Score (Logistic Regression):", lr_macro_f1)


In [None]:
# Vectorise the test text with the already-fitted TF-IDF vectorizer
# (transform only, no refitting).
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])

# Label every test row with the SVM.
test_predictions = svm_model.predict(X_test_tfidf)

# Pair each test row's `index` value with its predicted language.
submission_df = test_df[['index']].assign(lang_id=test_predictions)

# Write the submission without the DataFrame's own row index.
submission_df.to_csv('submission_svm.csv', index=False)

In [11]:
# Raw test text (the `text` column of the test frame).
X_test = test_df['text']

# Vectorise it with the already-fitted TF-IDF vectorizer (transform only).
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Label every test row with the Naive Bayes model.
test_predictions_nb = nb_model.predict(X_test_tfidf)

# Pair each test row's `index` value with its predicted language.
submission_df_nb = test_df[['index']].assign(lang_id=test_predictions_nb)

# Write the submission without the DataFrame's own row index.
submission_df_nb.to_csv('submission_nb_with_ngrams.csv', index=False)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

# Features (cleaned text) and labels from the training frame
X = train_df['text']
y = train_df['lang_id']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer for unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit and transform the TF-IDF vectorizer on the training set
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the validation set using the same TF-IDF vectorizer
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Extract the length of the text (character count) as an extra feature
X_train_length = X_train.apply(len)
X_val_length = X_val.apply(len)

# Stack the TF-IDF matrix and the text length feature horizontally.
# The length column is appended last, so the validation and test matrices
# below must be built the same way to keep the column layout aligned.
X_train_final = hstack([X_train_tfidf, X_train_length.values.reshape(-1, 1)])
X_val_final = hstack([X_val_tfidf, X_val_length.values.reshape(-1, 1)])

# Initialize and train the Multinomial Naive Bayes model
nb_model = MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
nb_model.fit(X_train_final, y_train)

# Score the model on the held-out split before producing a submission,
# as is done for the other models in this notebook.
y_pred_nb_features = nb_model.predict(X_val_final)
print("Naive Bayes (TF-IDF + text length) Classification Report:")
print(classification_report(y_val, y_pred_nb_features))
print("Mean F1-Score (Naive Bayes, TF-IDF + text length):",
      f1_score(y_val, y_pred_nb_features, average='micro'))

# Transform the test set using the same TF-IDF vectorizer
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])

# Extract the length of the text as a feature for the test set
X_test_length = test_df['text'].apply(len)

# Stack the TF-IDF matrix and the text length feature horizontally for the test set
X_test_final = hstack([X_test_tfidf, X_test_length.values.reshape(-1, 1)])

# Make predictions on the test set using Naive Bayes
test_predictions_nb = nb_model.predict(X_test_final)

# Create a submission DataFrame
submission_df_nb = pd.DataFrame({'index': test_df['index'], 'lang_id': test_predictions_nb})

# Save the submission file for Naive Bayes
submission_df_nb.to_csv('submission_nb_with_features.csv', index=False)


In [None]:
# Vectorise the test text with the already-fitted TF-IDF vectorizer
# (transform only, no refitting).
test_text = test_df['text']
X_test_tfidf_rf = tfidf_vectorizer.transform(test_text)

# Label every test row with the Random Forest.
test_predictions_rf = rf_model.predict(X_test_tfidf_rf)

# Pair each test row's `index` value with its predicted language.
submission_df_rf = test_df[['index']].assign(lang_id=test_predictions_rf)

# Write the submission without the DataFrame's own row index.
submission_df_rf.to_csv('submission_rf.csv', index=False)


In [None]:
# Vectorise the test text with the already-fitted TF-IDF vectorizer
# (transform only, no refitting).
test_text = test_df['text']
X_test_tfidf_lr = tfidf_vectorizer.transform(test_text)

# Label every test row with the Logistic Regression model.
test_predictions_lr = lr_model.predict(X_test_tfidf_lr)

# Pair each test row's `index` value with its predicted language.
submission_df_lr = test_df[['index']].assign(lang_id=test_predictions_lr)

# Write the submission without the DataFrame's own row index.
submission_df_lr.to_csv('submission_lr.csv', index=False)
