# Dataset Viewing

In [1]:
import csv

# Peek at the training CSV: print the column names and the first data row.
# newline='' is what the csv module asks for on files handed to csv.reader
# (quoted fields containing line breaks then parse correctly), and the explicit
# utf-8 encoding avoids depending on the platform default for Tamil text.
with open('Tam-SA-train.csv', newline='', encoding='utf-8') as f:
  reader = csv.reader(f)
  header_row = next(reader)  # First row holds the column names
  print(header_row)
  print(next(reader))  # First actual sample

['Text', 'Label']
['Ennq pa idhu paei padama twist nalla irkkae', 'Positive']


In [2]:
# Peek at the validation CSV: print only its column names.
# newline='' is what the csv module asks for on files handed to csv.reader,
# and the explicit utf-8 encoding avoids depending on the platform default.
with open('Tam-SA-val.csv', newline='', encoding='utf-8') as f:
  reader = csv.reader(f)
  header_row = next(reader)  # First row holds the column names
  print(header_row)

['Text', 'Label']


In [None]:
# Peek at the unlabeled test CSV: print only its column names.
# newline='' is what the csv module asks for on files handed to csv.reader,
# and the explicit utf-8 encoding avoids depending on the platform default.
with open('Tam-SA-test-without-labels.csv', newline='', encoding='utf-8') as f:
  reader = csv.reader(f)
  header_row = next(reader)  # First row holds the column names
  print(header_row)

['Id', 'Text']


# 1. Data Preprocessing

In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Read the three CSV splits in one pass, in this order:
#   training data, labeled evaluation data, unlabeled data to predict on.
# Adjust the file names if the data lives somewhere else.
train_data, test_data_with_labels, unlabeled_test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Tam-SA-train.csv",
        "Tam-SA-val.csv",
        "Tam-SA-test-without-labels.csv",
    )
)

# Preprocessing Function
def preprocess_text(text):
    """Normalise one raw comment before vectorisation.

    Steps, in order: lowercase, strip URLs, strip @mentions and the '#'
    symbol (the hashtag word itself is kept), then drop punctuation/symbols.

    Args:
        text: The raw comment. Missing values (None or float NaN, which is
            what pandas.read_csv produces for empty cells) are treated as an
            empty string; any other non-string value is converted with str().

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed tokens
        can leave double or trailing spaces behind.
    """
    import unicodedata  # local import: only this function needs it

    if not isinstance(text, str):
        # `text != text` is the dependency-free NaN check.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    def _drop_unless_combining_mark(match):
        # Python's \w does not match combining marks (Unicode categories
        # Mn/Mc/Me), so a plain [^\w\s] removal would delete Tamil vowel
        # signs and the pulli, corrupting native-script words. Keep marks,
        # drop every other non-word, non-space character.
        ch = match.group()
        return ch if unicodedata.category(ch).startswith('M') else ''

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+|\#', '', text)  # Remove @mentions and the '#' symbol
    text = re.sub(r'[^\w\s]', _drop_unless_combining_mark, text)  # Remove special characters
    return text

# Clean the 'Text' column of every split with the same function.
for frame in (train_data, test_data_with_labels, unlabeled_test_data):
    frame['Text'] = frame['Text'].apply(preprocess_text)

# Turn the string sentiment into a numeric target for the two labeled splits.
# Any label other than the two listed here becomes NaN.
label_to_id = {'Positive': 1, 'Negative': 0}
for frame in (train_data, test_data_with_labels):
    frame['Label'] = frame['Label'].map(label_to_id)


# 2. Splitting Training Data

In [4]:
# Discard training rows that have no usable (non-NaN) label.
train_data = train_data.dropna(subset=['Label'])

# Hold out 20% of the remaining rows for validation; the fixed seed keeps the
# split reproducible.
texts, labels = train_data['Text'], train_data['Label']
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    random_state=42,
    test_size=0.2,
)


# 3. Feature Extraction

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# The vocabulary is learned from the training split only ...
X_train_vec = vectorizer.fit_transform(X_train)

# ... and then reused unchanged for validation, labeled test and unlabeled test.
X_val_vec, X_test_vec, X_unlabeled_vec = (
    vectorizer.transform(texts)
    for texts in (
        X_val,
        test_data_with_labels['Text'],
        unlabeled_test_data['Text'],
    )
)

# 4. Model Training Using Logistic Regression


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic-regression classifier with default settings (fit returns the estimator).
model = LogisticRegression().fit(X_train_vec, y_train)

# Score the held-out validation split and print the per-class report.
y_val_pred = model.predict(X_val_vec)
print(
    "Validation Set Performance:",
    classification_report(y_val, y_val_pred),
    sep="\n",
)

Validation Set Performance:
              precision    recall  f1-score   support

         0.0       0.72      0.28      0.40       836
         1.0       0.85      0.98      0.91      3624

    accuracy                           0.84      4460
   macro avg       0.79      0.63      0.65      4460
weighted avg       0.83      0.84      0.81      4460



# 5. Model Evaluation on Labeled Data

In [7]:
# Evaluate on the labeled test split.
# Rows whose label is NaN cannot be scored, so they are dropped first. The
# surviving rows keep their original index values, which are used below to
# select the matching rows of the full-size matrix and prediction array.
test_data_with_labels = test_data_with_labels.dropna(subset=['Label'])
labeled_rows = test_data_with_labels.index

# Predictions are made for every row of X_test_vec (including dropped ones).
y_test_pred = model.predict(X_test_vec)

# Feature rows restricted to the labeled subset.
X_test_vec_filtered = X_test_vec[labeled_rows]

print("Test Set Performance:")
test_report = classification_report(
    test_data_with_labels['Label'],
    y_test_pred[labeled_rows],
)
print(test_report)

Test Set Performance:
              precision    recall  f1-score   support

         0.0       0.72      0.29      0.41       480
         1.0       0.87      0.98      0.92      2272

    accuracy                           0.86      2752
   macro avg       0.79      0.63      0.67      2752
weighted avg       0.84      0.86      0.83      2752



# 6. Predict Sentiment for Unlabeled Data

In [14]:
# Classify the unlabeled comments and store the result as readable label names.
id_to_label = {1: 'Positive', 0: 'Negative'}
raw_predictions = pd.Series(
    model.predict(X_unlabeled_vec),
    index=unlabeled_test_data.index,
)
unlabeled_test_data['Predicted_Label'] = raw_predictions.map(id_to_label)

# Write only the id and the predicted label to disk, without the row index.
submission_columns = ['Id', 'Predicted_Label']
unlabeled_test_data.to_csv("predictions.csv", columns=submission_columns, index=False)
print("Predictions saved to predictions.csv")

Predictions saved to predictions.csv


In [16]:
import pandas as pd

# Reload the saved predictions and expose the label column under the name 'Labels'.
df = pd.read_csv('predictions.csv').rename(columns={'Predicted_Label': 'Labels'})

# Persist the renamed table as a separate file, without the row index.
df.to_csv('updated_file.csv', index=False)

In [19]:
# Download the renamed predictions file ('updated_file.csv') via Colab's file helper.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm, and skip this cell when running elsewhere.
from google.colab import files
files.download('updated_file.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
# Print every row of the final submission file as a sanity check.
# newline='' is what the csv module asks for on files handed to csv.reader,
# and the explicit utf-8 encoding avoids depending on the platform default.
# Relies on `csv` having been imported by the first cell of the notebook.
with open('updated_file.csv', newline='', encoding='utf-8') as f:
  reader = csv.reader(f)
  for row in reader:
    print(row)

['Id', 'Labels']
['SA_Ta_01', 'Positive']
['SA_Ta_02', 'Positive']
['SA_Ta_03', 'Positive']
['SA_Ta_04', 'Positive']
['SA_Ta_05', 'Negative']
['SA_Ta_06', 'Positive']
['SA_Ta_07', 'Negative']
['SA_Ta_08', 'Positive']
['SA_Ta_09', 'Positive']
['SA_Ta_10', 'Positive']
['SA_Ta_11', 'Positive']
['SA_Ta_12', 'Positive']
['SA_Ta_13', 'Positive']
['SA_Ta_14', 'Positive']
['SA_Ta_15', 'Positive']
['SA_Ta_16', 'Positive']
['SA_Ta_17', 'Positive']
['SA_Ta_18', 'Positive']
['SA_Ta_19', 'Positive']
['SA_Ta_20', 'Negative']
['SA_Ta_21', 'Positive']
['SA_Ta_22', 'Positive']
['SA_Ta_23', 'Positive']
['SA_Ta_24', 'Positive']
['SA_Ta_25', 'Positive']
['SA_Ta_26', 'Positive']
['SA_Ta_27', 'Positive']
['SA_Ta_28', 'Positive']
['SA_Ta_29', 'Positive']
['SA_Ta_30', 'Positive']
['SA_Ta_31', 'Positive']
['SA_Ta_32', 'Positive']
['SA_Ta_33', 'Positive']
['SA_Ta_34', 'Positive']
['SA_Ta_35', 'Positive']
['SA_Ta_36', 'Positive']
['SA_Ta_37', 'Positive']
['SA_Ta_38', 'Positive']
['SA_Ta_39', 'Positive']
['SA_Ta_