In [3]:
# EDA: load the training and test sets
import pandas as pd

# Read the training and test CSV files (paths are relative to the
# notebook's working directory) into train_df and test_df, in that order.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_set.csv', 'test_set.csv')
)

In [4]:
#Data Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [5]:
# Show the column labels of the training frame; later cells select the
# 'text' and 'lang_id' columns by name.
print(train_df.columns)


Index(['lang_id', 'text'], dtype='object')


In [6]:
# Train/validation split
# Hold out 20% of the labelled rows for validation. The fixed random_state
# keeps the split repeatable across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],     # model input
    train_df['lang_id'],  # target label
    test_size=0.2,
    random_state=42,
)

In [7]:
# Feature Engineering
# TF-IDF over 1- and 2-grams, with the vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit on the training split only; the validation split is merely transformed
# so it does not influence the fitted vectoriser.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

In [8]:
# Step 4: Model Development
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 2 x 2 x 2 x 2 = 16 candidate random forests.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Fixed random_state so the forest itself is repeatable between runs.
rf = RandomForestClassifier(random_state=42)

# Rank candidates by weighted F1 -- the same metric this notebook reports on
# the validation split -- instead of the estimator's default score, so model
# selection and model evaluation agree on what "best" means.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,  # 3 folds per candidate -> 48 fits in total
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_tfidf, y_train)

# Keep the winning estimator for evaluation and submission, and report the
# hyper-parameters that were selected.
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)








Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [9]:
# Step 5: Model Evaluation
# Predict on the held-out validation features and score the predictions
# against the validation labels with a weighted F1.
y_pred = best_rf.predict(X_val_tfidf)
f1 = f1_score(
    y_true=y_val,
    y_pred=y_pred,
    average='weighted',
)
print(f"F1 Score: {f1}")

F1 Score: 0.9309169969809116


In [11]:
# Step 6: Generate Submission File
# Vectorise the test text with the already-fitted TF-IDF (transform only,
# no refit) and store the predictions as a new column on test_df.
test_tfidf = tfidf.transform(test_df['text'])
test_df['lang_ID'] = best_rf.predict(test_tfidf)

# NOTE(review): the training label column is 'lang_id' but the column written
# here is 'lang_ID' -- confirm which header the submission format expects.
submission_df = test_df[['index', 'lang_ID']]

# index=False keeps the DataFrame's own row index out of the CSV.
submission_df.to_csv(path_or_buf='submission.csv', index=False)

In [12]:
# Display the submission DataFrame
from IPython.display import display
# Preview the first five rows of the submission frame as a rich table.
display(submission_df.head(n=5))

Unnamed: 0,index,lang_ID
0,1,nbl
1,2,nbl
2,3,ven
3,4,ssw
4,5,nbl
