## CS583- DMTM - Research Project

### Data Cleaning

In [1]:
# Dataset from blackboard CS583 Fall 2025 DMTM
test_dataset_file_path = "data/test/sample-testdata.xlsx"
train_dataset_file_path = "data/training/training-Obama-Romney-tweets.xlsx"

In [None]:
# converting to pandas dataframe
import pandas as pd
# 3 sheets in the xlsx file, first one is obama tweets, second one is romney tweets, third one is empty
obama_train_df = pd.read_excel(train_dataset_file_path, sheet_name=0)
romney_train_df = pd.read_excel(train_dataset_file_path, sheet_name=1)
obama_test_df = pd.read_excel(test_dataset_file_path, sheet_name=0)
romney_test_df = pd.read_excel(test_dataset_file_path, sheet_name=1)

print("Obama train shape before cleaning:", obama_train_df.shape)
print("Romney train shape before cleaning:", romney_train_df.shape)
print("Obama test shape:", obama_test_df.shape)
print("Romney test shape:", romney_test_df.shape)

Obama train shape before cleaning: (7199, 6)
Romney train shape before cleaning: (7201, 6)
Obama test shape: (7199, 6)
Romney test shape: (3, 2)


In [3]:
obama_train_df.head()

Unnamed: 0.1,Unnamed: 0,date,time,Anootated tweet,Unnamed: 4,Unnamed: 5
0,,,,"1: positive, -1: negative, 0: neutral, 2: mixed",Class,Your class
1,,2012-10-16 00:00:00,10:28:53-05:00,"Kirkpatrick, who wore a baseball cap embroider...",0,
2,,2016-12-10 00:00:00,10:09:00-05:00,Question: If <e>Romney</e> and <e>Obama</e> ha...,2,
3,,2012-10-16 00:00:00,10:04:30-05:00,#<e>obama</e> debates that Cracker Ass Cracker...,1,
4,,2012-10-16 00:00:00,10:00:36-05:00,RT @davewiner Slate: Blame <e>Obama</e> for fo...,2,


In [4]:
# seperating rows and columns of interest ( rows 1 till end except 0) (columns 2, 5,6 only)
obama_train_df1 = obama_train_df.iloc[1:, [ 3, 4]]

# similarly for romney
romney_train_df1 = romney_train_df.iloc[1:, [ 3, 4]]

obama_train_df1.head()

Unnamed: 0,Anootated tweet,Unnamed: 4
1,"Kirkpatrick, who wore a baseball cap embroider...",0
2,Question: If <e>Romney</e> and <e>Obama</e> ha...,2
3,#<e>obama</e> debates that Cracker Ass Cracker...,1
4,RT @davewiner Slate: Blame <e>Obama</e> for fo...,2
5,@Hollivan @hereistheanswer Youre missing the ...,0


In [5]:
# renaming column name for col 1 to tweet and col 2`` to Class and keeping only '-1', '0' and '1' classes
obama_train_df1 = obama_train_df1.rename(columns={obama_train_df1.columns[0]: 'tweet', obama_train_df1.columns[1]: 'Class'})
# converting Class to numeric and keep only -1, 0, 1 for both datasets
obama_train_df1['Class'] = pd.to_numeric(obama_train_df1['Class'], errors='coerce')
obama_train_df1 = obama_train_df1[obama_train_df1['Class'].isin([-1, 0, 1])]

# similarly for romney
romney_train_df1[romney_train_df1.columns[1]] = pd.to_numeric(romney_train_df1[romney_train_df1.columns[1]], errors='coerce')
romney_train_df1 = romney_train_df1[romney_train_df1[romney_train_df1.columns[1]].isin([-1, 0, 1])]
romney_train_df1 = romney_train_df1.rename(columns={romney_train_df1.columns[0]: 'tweet', romney_train_df1.columns[1]: 'Class'})

obama_train_df1.head()

Unnamed: 0,tweet,Class
1,"Kirkpatrick, who wore a baseball cap embroider...",0.0
3,#<e>obama</e> debates that Cracker Ass Cracker...,1.0
5,@Hollivan @hereistheanswer Youre missing the ...,0.0
7,I was raised as a Democrat left the party yea...,-1.0
8,The <e>Obama camp</e> can't afford to lower ex...,0.0


In [6]:
# removing null values from col 'tweet' and 'Class'
obama_train_df2 = obama_train_df1.dropna(subset=['tweet', 'Class'])

# similarly for romney
romney_train_df2 = romney_train_df1.dropna(subset=['tweet', 'Class'])

print("Obama train shape after cleaning:", obama_train_df2.shape)
print("Romney train shape after cleaning:", romney_train_df2.shape)

Obama train shape after cleaning: (5624, 2)
Romney train shape after cleaning: (5648, 2)


In [7]:
# saving cleaned data to csv
obama_train_df2.to_csv("data/training/cleaned_obama_train_data.csv", index=False)
romney_train_df2.to_csv("data/training/cleaned_romney_train_data.csv", index=False)

In [8]:
# checking class distribution
obama_neg, obama_pos, obama_neu = obama_train_df2[obama_train_df2['Class'] == -1], obama_train_df2[obama_train_df2['Class'] == 1], obama_train_df2[obama_train_df2['Class'] == 0]
print("No. of negative samples in Obama train data:", obama_neg.shape[0])
print("No. of neutral samples in Obama train data:", obama_neu.shape[0])
print("No. of positive samples in Obama train data:", obama_pos.shape[0])
print()

# similarly for romney
romney_neg, romney_pos, romney_neu = romney_train_df2[romney_train_df2['Class'] == -1], romney_train_df2[romney_train_df2['Class'] == 1], romney_train_df2[romney_train_df2['Class'] == 0]
print("No. of negative samples in Romney train data:", romney_neg.shape[0])
print("No. of neutral samples in Romney train data:", romney_neu.shape[0])
print("No. of positive samples in Romney train data:", romney_pos.shape[0])

No. of negative samples in Obama train data: 1968
No. of neutral samples in Obama train data: 1977
No. of positive samples in Obama train data: 1679

No. of negative samples in Romney train data: 2893
No. of neutral samples in Romney train data: 1680
No. of positive samples in Romney train data: 1075


In [9]:
# spliting 90 to 10 for train and test by percentage
obama_train_neg, obama_test_neg = obama_neg.iloc[:int(0.9*len(obama_neg))], obama_neg.iloc[int(0.9*len(obama_neg)):]
obama_train_neu, obama_test_neu = obama_neu.iloc[:int(0.9*len(obama_neu))], obama_neu.iloc[int(0.9*len(obama_neu)):]
obama_train_pos, obama_test_pos = obama_pos.iloc[:int(0.9*len(obama_pos))], obama_pos.iloc[int(0.9*len(obama_pos)):]

print("Obama train shape after split:", (obama_train_neg.shape[0] + obama_train_neu.shape[0] + obama_train_pos.shape[0]))
print("Obama test shape after split:", (obama_test_neg.shape[0] + obama_test_neu.shape[0] + obama_test_pos.shape[0]))
print()

Obama train shape after split: 5061
Obama test shape after split: 563



In [10]:
# similarly for romney
romney_train_neg, romney_test_neg = romney_neg.iloc[:int(0.9*len(romney_neg))], romney_neg.iloc[int(0.9*len(romney_neg)):]
romney_train_neu, romney_test_neu = romney_neu.iloc[:int(0.9*len(romney_neu))], romney_neu.iloc[int(0.9*len(romney_neu)):]
romney_train_pos, romney_test_pos = romney_pos.iloc[:int(0.9*len(romney_pos))], romney_pos.iloc[int(0.9*len(romney_pos)):]

print("Romney train shape after split:", (romney_train_neg.shape[0] + romney_train_neu.shape[0] + romney_train_pos.shape[0]))
print("Romney test shape after split:", (romney_test_neg.shape[0] + romney_test_neu.shape[0] + romney_test_pos.shape[0]))
print()

Romney train shape after split: 5082
Romney test shape after split: 566



### Model Training

In [11]:
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
import joblib

model = SentenceTransformer('cardiffnlp/twitter-roberta-base-sentiment-latest', device='mps')

obama_train = pd.concat([obama_train_pos, obama_train_neu, obama_train_neg], ignore_index=True)
romney_train = pd.concat([romney_train_pos, romney_train_neu, romney_train_neg], ignore_index=True)

train_df = pd.concat([obama_train, romney_train], ignore_index=True)
train_df = train_df.dropna(subset=['tweet'])

X_train_text = train_df['tweet'].astype(str).tolist()
y_train = train_df['Class'].values # Labels: -1, 0, 1

print("Encoding Training Data...")
X_train = model.encode(X_train_text, show_progress_bar=True)

# multi-class (-1, 0, 1)
print("Training Classifier...")
classifier = LogisticRegression(max_iter=100, class_weight='balanced')
classifier.fit(X_train, y_train)

joblib.dump(classifier, 'logistic_regression_model.pkl')

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name cardiffnlp/twitter-roberta-base-sentiment-latest. Creating a new one with mean pooling.


Encoding Training Data...


Batches: 100%|██████████| 317/317 [00:32<00:00,  9.66it/s]
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights


Training Classifier...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['logistic_regression_model.pkl']

### Model Testing after first training

In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sentence_transformers import SentenceTransformer
import joblib

model = SentenceTransformer('cardiffnlp/twitter-roberta-base-sentiment-latest', device='mps')

obama_test = pd.concat([obama_test_pos, obama_test_neu, obama_test_neg], ignore_index=True)
romney_test = pd.concat([romney_test_pos, romney_test_neu, romney_test_neg], ignore_index=True)
test_df = pd.concat([obama_test, romney_test], ignore_index=True)
test_df = test_df.dropna(subset=['tweet'])

X_test_text = test_df['tweet'].astype(str).tolist()
y_test = test_df['Class'].values

print("Encoding Test Data...")
X_test = model.encode(X_test_text, show_progress_bar=True)

classifier = joblib.load('logistic_regression_model.pkl')
predictions = classifier.predict(X_test)

# scores for each class and overall accuracy and f1 score
print("\nClassification Report:")
print(classification_report(y_test, predictions))

print(f"overall accuracy: {accuracy_score(y_test, predictions):.4f}")
print(f"overall f1 score: {f1_score(y_test, predictions, average='weighted'):.4f}")

No sentence-transformers model found with name cardiffnlp/twitter-roberta-base-sentiment-latest. Creating a new one with mean pooling.


Encoding Test Data...


Batches: 100%|██████████| 36/36 [00:03<00:00,  9.39it/s]


Classification Report:
              precision    recall  f1-score   support

        -1.0       0.75      0.63      0.69       487
         0.0       0.55      0.71      0.62       366
         1.0       0.69      0.62      0.65       276

    accuracy                           0.65      1129
   macro avg       0.66      0.65      0.65      1129
weighted avg       0.67      0.65      0.66      1129

overall accuracy: 0.6537
overall f1 score: 0.6564



  ret = a @ b
  ret = a @ b
  ret = a @ b


### Final evaluation: on blackboard test dataset

In [18]:
# testing on the final test dataset

final_testData_no_label_Obama_tweets = pd.read_excel("final-testData-no-label-Obama-tweets.xlsx", sheet_name=0, header=None, names=['tweet_number', 'tweet'])
final_testData_no_label_Romney_tweets = pd.read_excel("final-testData-no-label-Romney-tweets.xlsx", sheet_name=0, header=None, names=['tweet_number', 'tweet'])

print("obama shape:", final_testData_no_label_Obama_tweets.shape)
print("romney shape:", final_testData_no_label_Romney_tweets.shape)

final_testData_no_label_Obama_tweets.head()

obama shape: (1951, 2)
romney shape: (1900, 2)


Unnamed: 0,tweet_number,tweet
0,1,<e>Obama</e> has to maintain his professionali...
1,2,<e>Obama</e> went into the debate swinging and...
2,3,Ditto. I started @247LS 4 years ago. RT @bmorr...
3,4,I absolutely love <e>Obama</e>'s view in <a>im...
4,5,I'm agreeing completely with <e>Obama</e>'s st...


In [23]:
classifier = joblib.load('logistic_regression_model.pkl')
X_final_obama_text = final_testData_no_label_Obama_tweets['tweet'].astype(str).tolist()
X_final_romney_text = final_testData_no_label_Romney_tweets['tweet'].astype(str).tolist()
X_final_obama = model.encode(X_final_obama_text, show_progress_bar=True)
X_final_romney = model.encode(X_final_romney_text, show_progress_bar=True)
predictions_obama = classifier.predict(X_final_obama)
predictions_romney = classifier.predict(X_final_romney)

Batches: 100%|██████████| 61/61 [00:06<00:00,  9.12it/s]
Batches: 100%|██████████| 60/60 [00:06<00:00,  9.67it/s]
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [27]:
with open('final_obama_predictions.txt', 'w') as f:
    f.write("(setf x *(\n")
    for tweet_num, sentiment in zip(final_testData_no_label_Obama_tweets['tweet_number'], predictions_obama):
        f.write(f"({tweet_num} {int(sentiment)})\n")
    f.write("))")

with open('final_romney_predictions.txt', 'w') as f:
    f.write("(setf x *(\n")
    for tweet_num, sentiment in zip(final_testData_no_label_Romney_tweets['tweet_number'], predictions_romney):
        f.write(f"({tweet_num} {int(sentiment)})\n")
    f.write("))")

In [26]:
print("count of -1, 0, 1 in Obama final test predictions: ", 
      (list(predictions_obama).count(-1), 
       list(predictions_obama).count(0), 
       list(predictions_obama).count(1)))

print("count of -1, 0, 1 in Romney final test predictions: ", 
      (list(predictions_romney).count(-1), 
       list(predictions_romney).count(0), 
       list(predictions_romney).count(1)))

count of -1, 0, 1 in Obama final test predictions:  (604, 753, 594)
count of -1, 0, 1 in Romney final test predictions:  (870, 590, 440)
