In [1]:
import sys
import os

# Add the project root (the parent of the notebook's working directory) to
# sys.path so that `src` can be imported. The membership check keeps repeated
# runs of this cell from appending the same entry over and over.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)
from src.preprocess_data import preprocess_data

In [2]:
import os

# Show the kernel's working directory: relative paths such as ".." are
# resolved against it.
current_dir = os.getcwd()
print(current_dir)


d:\DataScience\Case Study\Elevate labs\Final_project\data\notebook


In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack


# 0. Paths
# Everything is located relative to the parent of the notebook's working
# directory -- the same root the first cell puts on sys.path -- instead of a
# machine-specific absolute "D:/..." prefix repeated for every file.
# `os` is imported in the earlier cells of this notebook.
DATA_DIR = os.path.abspath("..")
TRAIN_CSV = os.path.join(DATA_DIR, "twitter_training.csv")
MODELS_DIR = os.path.join(DATA_DIR, "models")


# 1. Load training data
# header=None: the column names are supplied here rather than read from the file.
train_df = pd.read_csv(
    TRAIN_CSV,
    header=None,
    names=["id", "account_name", "sentiment_type", "message"],
)


# 2. Preprocess
# preprocess_data is defined outside this notebook (src/preprocess_data.py).
# NOTE(review): step 5 reads a "message_clean" column, which is presumably
# created here -- confirm against the helper.
train_df = preprocess_data(train_df)


# 3. Encode target
# The fitted encoder is saved below so the integer labels can be mapped back.
le = LabelEncoder()
train_df["sentiment_encoded"] = le.fit_transform(train_df["sentiment_type"])


# 4. One-hot encode account_name
# dtype=int produces 0/1 integer indicators directly, so no positional
# iloc split/cast is needed and the result does not depend on how many
# columns preprocess_data returns or in which order.
train_df = pd.get_dummies(train_df, columns=["account_name"], prefix="acc", dtype=int)
train_df = train_df.drop(columns=["sentiment_type", "message"])

# Single source of truth for the indicator columns: the same ordered list
# selects the training features and is saved as an artifact, so the two
# cannot drift apart. Prefix match (not substring) on the "acc_" prefix.
dummy_columns = [col for col in train_df.columns if str(col).startswith("acc_")]


# 5. TF-IDF features
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_text = tfidf.fit_transform(train_df["message_clean"])
X_acc = train_df[dummy_columns]

# Column layout of X_final: TF-IDF columns first, then the acc_* indicators.
X_final = hstack([X_text, X_acc.values])
y = train_df["sentiment_encoded"]


# 6. Train model
clf = LogisticRegression(max_iter=300, n_jobs=-1)
clf.fit(X_final, y)


# 7. Save artifacts
# Create the output directory first so joblib.dump does not fail when it is missing.
os.makedirs(MODELS_DIR, exist_ok=True)
joblib.dump(clf, os.path.join(MODELS_DIR, "sentiment_model.pkl"))
joblib.dump(tfidf, os.path.join(MODELS_DIR, "tfidf_vectorizer.pkl"))
joblib.dump(le, os.path.join(MODELS_DIR, "label_encoder.pkl"))
# Save account_name dummy columns (names and order used for X_acc above)
joblib.dump(dummy_columns, os.path.join(MODELS_DIR, "dummy_columns.pkl"))
print("✅ Model training complete. Artifacts saved in models/")


✅ Model training complete. Artifacts saved in models/


In [4]:
# Inspect the final training frame: id, message_clean, sentiment_encoded and
# the acc_* indicator columns.
train_df

Unnamed: 0,id,message_clean,sentiment_encoded,acc_Amazon,acc_ApexLegends,acc_AssassinsCreed,acc_Battlefield,acc_Borderlands,acc_CS-GO,acc_CallOfDuty,...,acc_Overwatch,acc_PlayStation5(PS5),acc_PlayerUnknownsBattlegrounds(PUBG),acc_RedDeadRedemption(RDR),acc_TomClancysGhostRecon,acc_TomClancysRainbowSix,acc_Verizon,acc_WorldOfCraft,acc_Xbox(Xseries),acc_johnson&johnson
0,2401,im getting on borderlands and i will murder yo...,3,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2401,i am coming to the borders and i will kill you...,3,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2401,im getting on borderlands and i will kill you all,3,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2401,im coming on borderlands and i will murder you...,3,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2401,im getting on borderlands 2 and i will murder ...,3,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74677,9200,just realized that the windows partition of my...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74678,9200,just realized that my mac window partition is ...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74679,9200,just realized the windows partition of my mac ...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74680,9200,just realized between the windows partition of...,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Inspect the acc_* indicator columns that are stacked next to the TF-IDF
# features when X_final is built.
X_acc

Unnamed: 0,acc_Amazon,acc_ApexLegends,acc_AssassinsCreed,acc_Battlefield,acc_Borderlands,acc_CS-GO,acc_CallOfDuty,acc_CallOfDutyBlackopsColdWar,acc_Cyberpunk2077,acc_Dota2,...,acc_Overwatch,acc_PlayStation5(PS5),acc_PlayerUnknownsBattlegrounds(PUBG),acc_RedDeadRedemption(RDR),acc_TomClancysGhostRecon,acc_TomClancysRainbowSix,acc_Verizon,acc_WorldOfCraft,acc_Xbox(Xseries),acc_johnson&johnson
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74677,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74678,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74680,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Print the stacked sparse feature matrix (TF-IDF columns followed by the
# acc_* indicator columns).
print(X_final) 

<COOrdinate sparse matrix of dtype 'float64'
	with 1325643 stored elements and shape (73996, 5032)>
  Coords	Values
  (0, 2020)	0.19918564379168205
  (0, 1667)	0.2491470483526359
  (0, 3006)	0.14439970070965646
  (0, 615)	0.24592504208139573
  (0, 232)	0.11319361344755283
  (0, 4797)	0.20531095320555734
  (0, 2768)	0.3892302201335609
  (0, 4928)	0.14952511962260956
  (0, 166)	0.17802604671375613
  (0, 2023)	0.40068940814954823
  (0, 616)	0.39131783836950085
  (0, 321)	0.362171008167613
  (0, 4929)	0.33399753820460326
  (1, 232)	0.12479291789233574
  (1, 4797)	0.22634981025364748
  (1, 4928)	0.16484742740854172
  (1, 166)	0.19626893385235597
  (1, 321)	0.39928380682172226
  (1, 4929)	0.3682233130645803
  (1, 199)	0.2605097060363544
  (1, 882)	0.32720345624876335
  (1, 4317)	0.12150126238720697
  (1, 4031)	0.10476071959815704
  (1, 2357)	0.33571103647553363
  (1, 884)	0.4322068794063802
  :	:
  (73971, 5021)	1.0
  (73972, 5021)	1.0
  (73973, 5021)	1.0
  (73974, 5021)	1.0
  (73975, 5021)	

In [7]:
# List the acc_* column names that were saved to dummy_columns.pkl.
dummy_columns

['acc_Amazon',
 'acc_ApexLegends',
 'acc_AssassinsCreed',
 'acc_Battlefield',
 'acc_Borderlands',
 'acc_CS-GO',
 'acc_CallOfDuty',
 'acc_CallOfDutyBlackopsColdWar',
 'acc_Cyberpunk2077',
 'acc_Dota2',
 'acc_FIFA',
 'acc_Facebook',
 'acc_Fortnite',
 'acc_Google',
 'acc_GrandTheftAuto(GTA)',
 'acc_Hearthstone',
 'acc_HomeDepot',
 'acc_LeagueOfLegends',
 'acc_MaddenNFL',
 'acc_Microsoft',
 'acc_NBA2K',
 'acc_Nvidia',
 'acc_Overwatch',
 'acc_PlayStation5(PS5)',
 'acc_PlayerUnknownsBattlegrounds(PUBG)',
 'acc_RedDeadRedemption(RDR)',
 'acc_TomClancysGhostRecon',
 'acc_TomClancysRainbowSix',
 'acc_Verizon',
 'acc_WorldOfCraft',
 'acc_Xbox(Xseries)',
 'acc_johnson&johnson']