In [1]:
# Mount Google Drive into the Colab runtime; the CSV files read and written
# later in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import classification_report,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

In [11]:
# Read the training and validation CSVs from the mounted Drive folder.
train_set, val_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/CoLi-Dravidian_2024/mal_train.csv',
        '/content/drive/MyDrive/CoLi-Dravidian_2024/mal_val.csv',
    )
)

In [12]:
# Preview the first five training rows (columns: Unnamed: 0, Word, Tag).
train_set.head(n=5)

Unnamed: 0,Word,Tag
0,Supr,ENGLISH
1,trailer,ENGLISH
2,oru,MALAYALAM
3,vettame,MALAYALAM
4,like,ENGLISH


In [14]:
!pip install word2number
!pip install inflect
import pandas as pd
import inflect
from word2number import w2n

p = inflect.engine()

def number_to_words(number_text):
    """Spell out a numeric token in words.

    The token is parsed with ``w2n.word_to_num`` and the parsed value is
    rendered with the module-level inflect engine ``p``.

    Args:
        number_text: The token to normalise.

    Returns:
        The result of ``p.number_to_words`` for the parsed value, or
        ``number_text`` unchanged if either call raises ``ValueError``.
    """
    try:
        return p.number_to_words(w2n.word_to_num(number_text))
    except ValueError:
        # Not interpretable as a number: leave the token as it is.
        return number_text


def _spell_out_digit_tokens(frame):
    """Return a copy of ``frame`` with digit-only ``Word`` values spelled out.

    The decision to rewrite a token depends only on the token itself
    (``str.isdigit``), never on the ``Tag`` column. Gating on the gold tag
    would leak label information into the validation features and could not
    be reproduced on data that has no tags.

    Args:
        frame: DataFrame with a ``Word`` column.

    Returns:
        A new DataFrame; ``frame`` itself is left untouched so the raw data
        stays available to later cells.
    """
    normalised = frame.copy()
    normalised['Word'] = [
        number_to_words(str(token)) if str(token).isdigit() else token
        for token in frame['Word']
    ]
    return normalised


df_train = _spell_out_digit_tokens(train_set)
df_val = _spell_out_digit_tokens(val_set)



In [16]:
# Save the preprocessed splits. index=False keeps the positional index out of
# the files; the frames already carry an 'Unnamed: 0' column from the input
# CSVs, and writing the index again would add one more unnamed column.
df_train.to_csv('/content/drive/MyDrive/CoLi-Dravidian_2024/preprocess_num_word_train.csv', index=False)
df_val.to_csv('/content/drive/MyDrive/CoLi-Dravidian_2024/preprocess_num_word_val.csv', index=False)

**TF-IDF features: character n-grams (n = 1 to 5) over the romanized words**

In [19]:
# Character n-gram TF-IDF (1- to 5-grams). The vocabulary and IDF weights are
# learned from the training words only (fit_transform does the fit and the
# transform in a single pass); the validation words are then projected onto
# that same fitted vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1,5), analyzer= 'char')
train_tfidf = vectorizer.fit_transform(df_train['Word'])
dev_tfidf = vectorizer.transform(df_val['Word'])

**Model building: SVM (SVC), LR, Multinomial NB, DT and kNN classifiers, followed by a hard-voting ensemble and CatBoost, all trained on the romanized character TF-IDF features**

In [20]:
# Support-vector classifier with default settings, trained on the TF-IDF
# features and then applied to the validation features.
clf1 = svm.SVC().fit(train_tfidf, df_train['Tag'])
val_pred1 = clf1.predict(dev_tfidf)

In [21]:
# Per-class precision / recall / F1 of the SVC predictions on the validation split.
print(classification_report(y_true=df_val['Tag'], y_pred=val_pred1))

              precision    recall  f1-score   support

     ENGLISH       0.93      0.93      0.93       538
   MALAYALAM       0.91      0.99      0.95      1175
       MIXED       0.82      0.43      0.56        63
        NAME       0.89      0.85      0.87       169
      NUMBER       0.94      0.97      0.96        77
       OTHER       0.88      0.55      0.67       179
       PLACE       1.00      0.56      0.71         9
         SYM       1.00      0.99      1.00       294

    accuracy                           0.92      2504
   macro avg       0.92      0.78      0.83      2504
weighted avg       0.92      0.92      0.92      2504



In [22]:
# One-vs-rest logistic regression using the liblinear solver.
# NOTE(review): newer scikit-learn releases may deprecate `multi_class` -
# confirm against the installed version before upgrading.
clf2 = LogisticRegression(solver='liblinear', multi_class='ovr').fit(
    train_tfidf, df_train['Tag']
)
val_pred2 = clf2.predict(dev_tfidf)

In [23]:
# Validation report for the logistic-regression predictions.
print(classification_report(y_true=df_val['Tag'], y_pred=val_pred2))

              precision    recall  f1-score   support

     ENGLISH       0.89      0.90      0.90       538
   MALAYALAM       0.89      0.99      0.93      1175
       MIXED       0.79      0.17      0.29        63
        NAME       0.88      0.81      0.85       169
      NUMBER       0.94      0.95      0.94        77
       OTHER       0.81      0.46      0.59       179
       PLACE       1.00      0.44      0.62         9
         SYM       1.00      0.99      1.00       294

    accuracy                           0.90      2504
   macro avg       0.90      0.72      0.76      2504
weighted avg       0.89      0.90      0.89      2504



In [24]:
# Multinomial naive Bayes on the TF-IDF features.
clf3 = MultinomialNB(force_alpha=True).fit(train_tfidf, df_train['Tag'])
val_pred3 = clf3.predict(dev_tfidf)

In [25]:
# Validation report for the naive-Bayes predictions. Some classes receive no
# predictions from this model, which makes their precision undefined;
# zero_division=0 reports those cells as 0 explicitly instead of emitting a
# warning for each affected metric.
print(classification_report(df_val['Tag'], val_pred3, zero_division=0))

              precision    recall  f1-score   support

     ENGLISH       0.88      0.85      0.87       538
   MALAYALAM       0.82      0.99      0.90      1175
       MIXED       0.00      0.00      0.00        63
        NAME       0.85      0.72      0.78       169
      NUMBER       0.97      0.73      0.83        77
       OTHER       0.92      0.40      0.56       179
       PLACE       0.00      0.00      0.00         9
         SYM       1.00      0.99      1.00       294

    accuracy                           0.87      2504
   macro avg       0.68      0.59      0.62      2504
weighted avg       0.85      0.87      0.84      2504



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
# Decision tree with a fixed random_state so repeated runs build the same tree.
clf4 = DecisionTreeClassifier(random_state=0).fit(train_tfidf, df_train['Tag'])
val_pred4 = clf4.predict(dev_tfidf)

In [27]:
# Validation report for the decision-tree predictions.
print(classification_report(y_true=df_val['Tag'], y_pred=val_pred4))

              precision    recall  f1-score   support

     ENGLISH       0.91      0.92      0.92       538
   MALAYALAM       0.93      0.96      0.94      1175
       MIXED       0.55      0.41      0.47        63
        NAME       0.83      0.87      0.85       169
      NUMBER       0.89      0.91      0.90        77
       OTHER       0.77      0.59      0.66       179
       PLACE       0.70      0.78      0.74         9
         SYM       1.00      0.99      1.00       294

    accuracy                           0.91      2504
   macro avg       0.82      0.80      0.81      2504
weighted avg       0.90      0.91      0.90      2504



In [28]:
# k-nearest-neighbours classifier voting over the 3 closest training samples.
clf5 = KNeighborsClassifier(n_neighbors=3).fit(train_tfidf, df_train['Tag'])
val_pred5 = clf5.predict(dev_tfidf)

In [29]:
# Validation report for the kNN predictions.
print(classification_report(y_true=df_val['Tag'], y_pred=val_pred5))

              precision    recall  f1-score   support

     ENGLISH       0.90      0.90      0.90       538
   MALAYALAM       0.93      0.97      0.95      1175
       MIXED       0.61      0.27      0.37        63
        NAME       0.85      0.86      0.86       169
      NUMBER       0.82      0.97      0.89        77
       OTHER       0.77      0.60      0.68       179
       PLACE       0.83      0.56      0.67         9
         SYM       1.00      0.99      1.00       294

    accuracy                           0.91      2504
   macro avg       0.84      0.77      0.79      2504
weighted avg       0.90      0.91      0.90      2504



In [34]:
# Majority-vote (hard voting) ensemble over three of the classifiers above.
# clf2 is the LogisticRegression model, so it is registered under 'LR' so
# that the estimator name matches the estimator type.
voting = VotingClassifier(
    estimators=[('LR', clf2), ('DecisionTreeClassifier', clf4), ('kNN', clf5)],
    voting='hard')

In [35]:
# Fit the ensemble on the training TF-IDF features.
voting.fit(X=train_tfidf, y=df_train['Tag'])

In [37]:
# Predict the validation split with the ensemble and print its report.
voting_pred = voting.predict(X=dev_tfidf)
print(classification_report(y_true=df_val['Tag'], y_pred=voting_pred))

              precision    recall  f1-score   support

     ENGLISH       0.89      0.93      0.91       538
   MALAYALAM       0.91      0.98      0.95      1175
       MIXED       0.74      0.22      0.34        63
        NAME       0.88      0.84      0.86       169
      NUMBER       0.94      0.95      0.94        77
       OTHER       0.88      0.56      0.68       179
       PLACE       1.00      0.56      0.71         9
         SYM       1.00      0.99      1.00       294

    accuracy                           0.91      2504
   macro avg       0.90      0.75      0.80      2504
weighted avg       0.91      0.91      0.90      2504



In [38]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [39]:
from catboost import CatBoostClassifier

# Hyper-parameters for the gradient-boosted multi-class model.
catboost_settings = dict(
    iterations=1000,             # maximum number of boosting iterations
    learning_rate=0.1,
    depth=6,                     # depth of each tree
    loss_function='MultiClass',
    verbose=100,                 # log every 100th iteration
)
model = CatBoostClassifier(**catboost_settings)

# Train with the validation split as eval_set; training stops early if the
# eval metric has not improved for 50 consecutive iterations.
model.fit(
    train_tfidf,
    df_train['Tag'],
    eval_set=(dev_tfidf, df_val['Tag']),
    early_stopping_rounds=50,
    verbose=100,
)

# Predict the validation split and print the per-class report.
y_pred = model.predict(dev_tfidf)

print("\n", classification_report(df_val['Tag'], y_pred))

0:	learn: 1.7937670	test: 1.7791711	best: 1.7791711 (0)	total: 2.76s	remaining: 46m
100:	learn: 0.6095413	test: 0.5817713	best: 0.5817713 (100)	total: 2m 36s	remaining: 23m 11s
200:	learn: 0.5040708	test: 0.4861579	best: 0.4861579 (200)	total: 5m 7s	remaining: 20m 21s
300:	learn: 0.4583066	test: 0.4491617	best: 0.4491617 (300)	total: 7m 35s	remaining: 17m 37s
400:	learn: 0.4314657	test: 0.4289790	best: 0.4289790 (400)	total: 10m 4s	remaining: 15m 3s
500:	learn: 0.4121518	test: 0.4134887	best: 0.4134887 (500)	total: 12m 35s	remaining: 12m 32s
600:	learn: 0.3992387	test: 0.4047336	best: 0.4047242 (599)	total: 15m 3s	remaining: 9m 59s
700:	learn: 0.3885762	test: 0.3975202	best: 0.3975202 (700)	total: 17m 35s	remaining: 7m 30s
800:	learn: 0.3815967	test: 0.3930757	best: 0.3930757 (800)	total: 20m 5s	remaining: 4m 59s
900:	learn: 0.3749738	test: 0.3894284	best: 0.3894284 (900)	total: 22m 35s	remaining: 2m 28s
999:	learn: 0.3685071	test: 0.3860501	best: 0.3860501 (999)	total: 25m 2s	remainin

In [40]:
# Submission
# Write a two-column (Word, Tag) CSV pairing each validation word with the
# SVC prediction (val_pred1) at the same position. The columns are combined
# positionally in a single constructor call, so the result does not depend on
# index alignment between the words and the predictions.
Submisssion_mal = pd.DataFrame({
    'Word': val_set['Word'].to_numpy(),
    'Tag': val_pred1,
})
Submisssion_mal.to_csv('/content/drive/MyDrive/CoLi-Dravidian_2024/predictions.csv', index=False)