In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
import nltk
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline


# Fetch the NLTK stopword corpus so stopwords.words() below can load it
nltk.download('stopwords')

# English stopwords shipped with NLTK
stopwords_english = set(stopwords.words('english'))

# Tulu stopwords (example list only; replace with a curated Tulu list)
stopwords_tulu = {
    'ಅವರು', 'ಇದು', 'ಆಗ', 'ನಾನು', 'ಅಲ್ಲಿ', 'ಅದನ್ನು',
    'ಎಂದೆರೆ', 'ಅದೇ', 'ಅಂತಾ', 'ಅದು', 'ಮತ್ತು',
}

# One lookup set covering both languages, used by the preprocessing step
stopwords_combined = stopwords_english | stopwords_tulu

# Preprocessing function for Tulu text
def preprocess_tulu(text, stopwords=None):
    """Normalise one raw comment before TF-IDF vectorisation.

    The text is lower-cased; every character that is not an ASCII letter,
    an ASCII digit, a code point in U+0C80-U+0CFF, or whitespace is removed;
    then whitespace-separated tokens found in the stopword set are dropped.

    Args:
        text: Raw cell value. Missing values (``None`` / ``NaN``) yield
            ``""``. Any other non-string value is converted with ``str()``
            first, so e.g. a numeric cell does not raise ``AttributeError``
            on ``.lower()``.
        stopwords: Optional collection of tokens to drop. When omitted, the
            module-level ``stopwords_combined`` set is used.

    Returns:
        The remaining tokens joined by single spaces (possibly ``""``).
    """
    if pd.isnull(text):
        return ""

    if stopwords is None:
        stopwords = stopwords_combined

    # Cells that are not strings would otherwise fail on .lower()
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Strip punctuation, emoji and any script outside the allowed ranges
    text = re.sub(r'[^a-zA-Z0-9\u0C80-\u0CFF\s]', '', text)

    return ' '.join(word for word in text.split() if word not in stopwords)

# Read the train / validation / unlabeled-test splits
df_train, df_valid, df_test = map(
    pd.read_csv,
    (
        "/content/Tulu_SA_train.csv",
        "/content/Tulu_SA_val.csv",
        "/content/Tulu_SA_test_without_label.csv",
    ),
)

# Keep the untouched test text; it is written to the predictions file later
df_test['Original_Text'] = df_test['Text']

# Clean the text of every split; rows whose text ends up empty after
# cleaning receive the placeholder token 'empty_text'
for frame in (df_train, df_valid, df_test):
    cleaned = frame['Text'].apply(preprocess_tulu)
    frame['Text'] = cleaned.replace('', 'empty_text')

# Discard rows that have no label (the test split carries no labels)
for labelled in (df_train, df_valid):
    labelled.dropna(subset=['Label'], inplace=True)

# Step 1: TF-IDF features over unigrams and bigrams. The vocabulary is
# learned from the training split only and reused for the other splits.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_features=5000,
)
X_train = vectorizer.fit_transform(df_train['Text'])
X_valid, X_test = (
    vectorizer.transform(frame['Text']) for frame in (df_valid, df_test)
)

# Step 2: Target labels for training and validation
y_train, y_valid = df_train['Label'], df_valid['Label']

# Steps 3 + 4: Class-imbalance handling and hyperparameter tuning.
#
# SMOTE lives inside an imbalanced-learn Pipeline so that, during
# cross-validation, synthetic samples are generated from each training fold
# only. Oversampling the whole training set *before* GridSearchCV lets
# samples interpolated from held-out rows leak into the training folds,
# which biases the CV score used to pick the hyperparameters.
from imblearn.pipeline import Pipeline as ImbPipeline  # aliased: sklearn's Pipeline is already imported

smote = SMOTE(random_state=42)
svm = SVC(class_weight='balanced', probability=True)
model = ImbPipeline([('smote', smote), ('svm', svm)])

# Parameters of a pipeline step are addressed as <step name>__<parameter>
param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
}
grid_search = GridSearchCV(model, param_grid, scoring='f1_weighted', cv=3, verbose=1)
grid_search.fit(X_train, y_train)

# Best pipeline (SMOTE + SVC) refit on the full training set. The sampler
# is applied only while fitting, so predict / predict_proba operate on the
# un-resampled feature matrix passed in.
best_svm = grid_search.best_estimator_

# Step 5: Score the tuned model on the validation split
y_valid_pred = best_svm.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_report = classification_report(y_valid, y_valid_pred)

# Report the selected hyperparameters and the validation metrics
print("Best Parameters:", grid_search.best_params_)
print("Validation Accuracy:", valid_accuracy)
print("Validation Classification Report:")
print(valid_report)

# Calculate macro-averaged AUC-ROC (One-vs-Rest strategy).
# The columns returned by predict_proba follow best_svm.classes_, so the
# one-hot encoding of the true labels must use that same class order.
# Binarizing with df_train['Label'].unique() (order of first appearance)
# pairs each probability column with the wrong class and corrupts the score.
y_valid_binarized = label_binarize(y_valid, classes=best_svm.classes_)
y_valid_pred_prob = best_svm.predict_proba(X_valid)
roc_auc = roc_auc_score(y_valid_binarized, y_valid_pred_prob, average='macro', multi_class='ovr')
print("Validation AUC-ROC Score:", roc_auc)

# Step 6: Predict labels for the unlabeled test split and attach them
y_test_pred = best_svm.predict(X_test)
df_test['Predicted_Label'] = y_test_pred

# Export id, original (uncleaned) text and predicted label
output_columns = ['Id', 'Original_Text', 'Predicted_Label']
predictions = df_test[output_columns]
predictions.to_csv('tulu_test_predictions_svm2.csv', index=False)

# Show the first few predictions
print(predictions.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Parameters: {'C': 10, 'kernel': 'rbf'}
Validation Accuracy: 0.6114494518879415
Validation Classification Report:
              precision    recall  f1-score   support

       Mixed       0.28      0.17      0.22       143
    Negative       0.25      0.26      0.26       118
     Neutral       0.55      0.60      0.57       368
    Not Tulu       0.73      0.77      0.75       543
    Positive       0.68      0.66      0.67       470

    accuracy                           0.61      1642
   macro avg       0.50      0.49      0.49      1642
weighted avg       0.60      0.61      0.61      1642

Validation AUC-ROC Score: 0.462704276067869
         Id                                      Original_Text Predicted_Label
0  SA_TU_01              Keep it up bro...... always pukuli🤣🤣🤣        Not Tulu
1  SA_TU_02  Nxt comedy film geppanaga umben seravarle iyav...        Not Tulu
2  SA_TU_03  How Nepal is connected with Mangalore. 