<a href="https://colab.research.google.com/github/Tanushreejaganathan/Sentiment_Analysis_tamil_tulu/blob/main/Tulu_randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Load datasets
# NOTE(review): assumes Google Drive is already mounted at /content/drive
# before this cell runs — confirm.
train_path = '/content/drive/MyDrive/Tulutraincl1.csv'
val_path = '/content/drive/MyDrive/Tuluvalcl1.csv'
test_path = '/content/drive/MyDrive/Tulutestcl1.csv'

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

# Combine train and validation data for training
data_df = pd.concat([train_df, val_df], ignore_index=True)

# Drop rows with a missing label instead of relabelling them as 'Unknown'.
# A placeholder label becomes a real class to the classifier: it is learned,
# it can be predicted for test rows, and it distorts the macro-averaged F1
# that the hyperparameter search optimises.
n_missing_labels = int(data_df['Label'].isna().sum())
if n_missing_labels:
    print(f"Dropping {n_missing_labels} training rows with missing labels")
data_df = data_df.dropna(subset=['Label']).reset_index(drop=True)

# Ensure the remaining labels are plain strings
data_df['Label'] = data_df['Label'].astype(str)

# Missing text is kept as an empty document so feature and label rows stay aligned
X_train, y_train = data_df['Text'].fillna(''), data_df['Label']
X_test = test_df['Text'].fillna('')

# TF-IDF features built from unigrams and bigrams, vocabulary capped at 5000 features
tfidf_settings = {'ngram_range': (1, 2), 'max_features': 5000}
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vocabulary and IDF weights are fitted on the training text only; the
# test text is then transformed with that same fitted vectorizer.
# NOTE(review): the vectorizer is fitted before any cross-validation split, so
# CV folds downstream share IDF statistics — confirm this is acceptable.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random Forest search space: 3 * 2 * 3 * 3 * 3 = 162 candidate settings
rf_params = dict(
    n_estimators=[100, 200, 300],     # number of trees in the forest
    criterion=['gini', 'entropy'],    # split-quality measure
    max_depth=[10, 20, None],         # None = no depth limit
    min_samples_split=[2, 5, 10],     # min samples needed to split an internal node
    min_samples_leaf=[1, 2, 5],       # min samples required at a leaf
)

# Exhaustive grid search with 3-fold CV, selecting on macro-averaged F1.
# The forest's seed is fixed; n_jobs=-1 uses all available cores.
base_forest = RandomForestClassifier(random_state=42)
rf_model = GridSearchCV(
    estimator=base_forest,
    param_grid=rf_params,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
rf_model.fit(X_train_tfidf, y_train)

# Best parameters and model evaluation
print(f"Best Parameters: {rf_model.best_params_}")

# Mean cross-validated score of the winning configuration. Unlike the report
# below, this is computed on folds the model was not fitted on, so it is the
# number to quote as an estimate of generalisation.
print(f"Best CV score ({rf_model.scoring}): {rf_model.best_score_:.4f}")

# In-sample report: the refit model is scored on the very rows it was trained
# on, so these figures are optimistic and only useful as a sanity check.
y_pred_train = rf_model.predict(X_train_tfidf)
print("\nTraining Performance:")
print(classification_report(y_train, y_pred_train))

# Label the test set with the tuned model and write it back out to Drive,
# keeping the original test columns next to the new prediction column.
test_predictions_path = '/content/drive/MyDrive/test_predictions_rf_tffi.csv'

rf_test_labels = rf_model.predict(X_test_tfidf)
test_df['RandomForest_Predicted_Label'] = rf_test_labels

test_df.to_csv(test_predictions_path, index=False)
print(f"Random Forest predictions saved to {test_predictions_path}")


Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}

Training Performance:
              precision    recall  f1-score   support

       Mixed       0.88      0.64      0.75      1257
    Negative       0.91      0.76      0.83       961
     Neutral       0.87      0.85      0.86      3543
    Not Tulu       0.81      0.96      0.88      4943
    Positive       0.89      0.82      0.85      4239
     Unknown       1.00      0.12      0.22         8

    accuracy                           0.85     14951
   macro avg       0.89      0.69      0.73     14951
weighted avg       0.86      0.85      0.85     14951

Random Forest predictions saved to /content/drive/MyDrive/test_predictions_rf_tffi.csv
