# 1. Data Preprocessing

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load Training and Testing Data
train_data = pd.read_csv("Tulu_SA_train.csv")  # Replace with actual file path
test_data_with_labels = pd.read_csv("Tulu_SA_val.csv")  # Testing data with labels
unlabeled_test_data = pd.read_csv("Tulu_SA_test_without_label.csv")  # Testing data without labels

# Preprocessing Function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+|\#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text

# Apply Preprocessing
train_data['Text'] = train_data['Text'].apply(preprocess_text)
test_data_with_labels['Text'] = test_data_with_labels['Text'].apply(preprocess_text)
unlabeled_test_data['Text'] = unlabeled_test_data['Text'].apply(preprocess_text)

# Encode Labels for Training and Testing Data
train_data['Label'] = train_data['Label'].map({'Positive': 1, 'Negative': 0})
test_data_with_labels['Label'] = test_data_with_labels['Label'].map({'Positive': 1, 'Negative': 0})


# 2. Splitting Training Data

In [None]:
# Remove rows with NaN values in 'Label' column from original train_data
train_data = train_data.dropna(subset=['Label'])

# Re-split data after handling NaNs
X_train, X_val, y_train, y_val = train_test_split(
    train_data['Text'], train_data['Label'], test_size=0.2, random_state=42
)


# 3. Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Unigrams and bigrams
X_train_vec = vectorizer.fit_transform(X_train)  # Fit and transform training data
X_val_vec = vectorizer.transform(X_val)         # Transform validation data
X_test_vec = vectorizer.transform(test_data_with_labels['Text'])  # Transform labeled test data
X_unlabeled_vec = vectorizer.transform(unlabeled_test_data['Text'])  # Transform unlabeled test data

# 4. Model Training using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Evaluate on Validation Data
y_val_pred = model.predict(X_val_vec)
print("Validation Set Performance:")
print(classification_report(y_val, y_val_pred))

Validation Set Performance:
              precision    recall  f1-score   support

         0.0       0.93      0.39      0.55       175
         1.0       0.87      0.99      0.93       748

    accuracy                           0.88       923
   macro avg       0.90      0.69      0.74       923
weighted avg       0.88      0.88      0.86       923



# 5. Model Evaluation on Labled Data

In [None]:
test_data_with_labels = test_data_with_labels.dropna(subset=['Label'])
y_test_pred = model.predict(X_test_vec)

# Now you need to filter X_test_vec to align with the rows removed from test_data_with_labels
# This is necessary because the predictions in y_test_pred correspond to the original X_test_vec
X_test_vec_filtered = X_test_vec[test_data_with_labels.index]

print("Test Set Performance:")
print(classification_report(test_data_with_labels['Label'], y_test_pred[test_data_with_labels.index]))

Test Set Performance:
              precision    recall  f1-score   support

         0.0       0.86      0.32      0.47       118
         1.0       0.85      0.99      0.92       470

    accuracy                           0.85       588
   macro avg       0.86      0.65      0.69       588
weighted avg       0.86      0.85      0.83       588



# 6. Predict Sentiment for unlabled data

In [None]:
unlabeled_test_data['Predicted_Label'] = model.predict(X_unlabeled_vec)
unlabeled_test_data['Predicted_Label'] = unlabeled_test_data['Predicted_Label'].map({1: 'Positive', 0: 'Negative'})

# Save Predictions to CSV
unlabeled_test_data[['Id', 'Predicted_Label']].to_csv("predictions.csv", index=False)
print("Predictions saved to predictions.csv")

Predictions saved to predictions.csv


In [7]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('predictions.csv')

# Rename the 'Predicted_Label' column to 'Labels'
df.rename(columns={'Predicted_Label': 'Labels'}, inplace=True)

# Save the updated DataFrame back to a CSV file
df.to_csv('updated_file.csv', index=False)

In [8]:
from google.colab import files
files.download('updated_file.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import csv
with open('predictions.csv') as f:
  reader = csv.reader(f)
  for row in reader:
    print(row)

['Id', 'Predicted_Label']
['SA_TU_01', 'Positive']
['SA_TU_02', 'Positive']
['SA_TU_03', 'Positive']
['SA_TU_04', 'Negative']
['SA_TU_05', 'Positive']
['SA_TU_06', 'Positive']
['SA_TU_07', 'Positive']
['SA_TU_08', 'Negative']
['SA_TU_09', 'Positive']
['SA_TU_10', 'Positive']
['SA_TU_11', 'Positive']
['SA_TU_12', 'Positive']
['SA_TU_13', 'Positive']
['SA_TU_14', 'Positive']
['SA_TU_15', 'Positive']
['SA_TU_16', 'Positive']
['SA_TU_17', 'Positive']
['SA_TU_18', 'Positive']
['SA_TU_19', 'Negative']
['SA_TU_20', 'Positive']
['SA_TU_21', 'Positive']
['SA_TU_22', 'Positive']
['SA_TU_23', 'Positive']
['SA_TU_24', 'Positive']
['SA_TU_25', 'Positive']
['SA_TU_26', 'Positive']
['SA_TU_27', 'Positive']
['SA_TU_28', 'Positive']
['SA_TU_29', 'Positive']
['SA_TU_30', 'Positive']
['SA_TU_31', 'Positive']
['SA_TU_32', 'Positive']
['SA_TU_33', 'Positive']
['SA_TU_34', 'Positive']
['SA_TU_35', 'Positive']
['SA_TU_36', 'Positive']
['SA_TU_37', 'Positive']
['SA_TU_38', 'Positive']
['SA_TU_39', 'Positive']