In [1]:
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle API client used for the connectivity check below.
api = KaggleApi()

# Smoke test: authenticate, then run one real request to confirm the credentials work.
# NOTE(review): the handler catches every Exception raised inside the try body, so
# failures that are not authorization problems are also reported as "Unauthorized".
try:
    api.authenticate()
    # List datasets instead of competitions
    # (sometimes competitions require you to accept rules on the website first).
    datasets = api.dataset_list(search='sentiment')
    print("✅ Full Authorization Successful!")
    print(f"Found {len(datasets)} datasets. Your connection is perfect.")
except Exception as e:
    print(f"❌ Still Unauthorized. Error: {e}")

✅ Full Authorization Successful!
Found 20 datasets. Your connection is perfect.


In [2]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle command-line tool.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to C:\Users\net\Desktop\MyNotebooks\Sentiment analysis




  0%|          | 0.00/80.9M [00:00<?, ?B/s]
 73%|#######2  | 59.0M/80.9M [00:00<00:00, 478MB/s]
100%|##########| 80.9M/80.9M [00:00<00:00, 462MB/s]


In [3]:
from zipfile import ZipFile

In [5]:
# Unpack the downloaded archive into the current working directory.
dataset = 'sentiment140.zip'
# The archive is bound to `archive` rather than `zip`: a `with` target stays in the
# notebook namespace after the cell finishes, and naming it `zip` would shadow the
# built-in zip() for every later cell.
with ZipFile(dataset, 'r') as archive:
    archive.extractall()
    print('The dataset is extracted')

The dataset is extracted


In [2]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # Fixed the typo here
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [7]:
# Download NLTK's stopwords list (needed if it has not been downloaded yet);
# the preprocessing step reads the English list from it.
nltk.download('stopwords')

# Printed once the download call has returned.
print('Import successful!')

Import successful!


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\net\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# First attempt: load the CSV with pandas' default header handling.
# NOTE(review): the file appears to have no header row. The frame loaded this way has
# 1,599,999 rows and its column labels are the values of the first tweet, i.e. the
# first record is consumed as the header.
twitter_data = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding = 'ISO-8859-1')


In [4]:
twitter_data.shape

(1599999, 6)

In [5]:
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
# Load the CSV again, this time supplying the column labels explicitly so that
# every line of the file is read as data.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO-8859-1',
)


In [14]:
twitter_data.shape

(1600000, 6)

In [7]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
#counting the number of missing values in the dataset
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [9]:
#checking the distribution of the target column
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [10]:
# Recode the label value 4 as 1 so the target column holds 0 / 1.
twitter_data['target'] = twitter_data['target'].replace({4: 1})

In [11]:
#checking the distribution of the target column after recoding
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [12]:
# Stemming
# Stemming reduces a word to its root form so that inflected variants share one token.
# Examples taken from this notebook's own output: "feels" -> "feel", "update" -> "updat".
# One stemmer instance is created here and reused by the stemming() function.
port_stem = PorterStemmer()

In [13]:
# English stopwords, loaded on the first call to stemming() and then reused.
_STOPWORD_SET = None
# Matches any single character that is not an ASCII letter.
_NON_LETTER_RE = re.compile('[^a-zA-Z]')


def stemming(content):
    """Clean one piece of text and return its stemmed tokens joined by spaces.

    Processing steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stopwords,
      4. stem each remaining word with the shared ``port_stem`` stemmer,
      5. join the stems with single spaces.

    Args:
        content: the raw text as a ``str``.

    Returns:
        The cleaned text as one string; empty if no word survives filtering.
    """
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        # Previously stopwords.words('english') was evaluated once per word and
        # searched as a list. Loading it a single time into a frozenset gives the
        # same result with a constant-time membership test.
        _STOPWORD_SET = frozenset(stopwords.words('english'))

    words = _NON_LETTER_RE.sub(' ', content).lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in _STOPWORD_SET
    )

In [14]:
# Run stemming() on every tweet's text and keep the result in a new column.
twitter_data['stemmed_content'] = twitter_data['text'].map(stemming)

In [15]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [16]:
print(twitter_data['stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [17]:
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [21]:
# Separate the model input (the stemmed text) from the label column.
x, y = twitter_data['stemmed_content'], twitter_data['target']

In [22]:
print(x)

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [23]:
print(y)

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [24]:
# Split the data into training and test sets: 20% of the rows are held out for
# testing, the split is stratified on y, and random_state is fixed at 2.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [25]:
print(x.shape, x_train.shape, x_test.shape)

(1600000,) (1280000,) (320000,)


In [26]:
print(x_test)

131348     mmangen fine much time chat twitter hubbi back...
1142114         ah may show w ruth kim amp geoffrey sanhueza
244564                   ishatara mayb bay area thang dammit
445353                                 game end lost stinkyy
415893                                          cool brother
                                 ...                        
178459     twitter drive nut wont let download profil pic...
1515130                                    teamqivana welcom
1449952    destini nevertheless hooray member wonder safe...
441063                                             feel well
1583304                                    supersandro thank
Name: stemmed_content, Length: 320000, dtype: object


In [28]:
# Convert the textual data to numerical data (TF-IDF features).
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that already-fitted vectorizer.
# NOTE(review): x_train / x_test are rebound to the transformed outputs, so running
# this cell a second time would pass those outputs (not the raw text) back into
# the vectorizer. Re-run the train/test split cell first if this cell must be repeated.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [29]:
print(x_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9453092 stored elements and shape (1280000, 461488)>
  Coords	Values
  (0, 436713)	0.27259876264838384
  (0, 354543)	0.3588091611460021
  (0, 185193)	0.5277679060576009
  (0, 109306)	0.3753708587402299
  (0, 235045)	0.41996827700291095
  (0, 443066)	0.4484755317023172
  (1, 160636)	1.0
  (2, 109306)	0.4591176413728317
  (2, 124484)	0.1892155960801415
  (2, 407301)	0.18709338684973031
  (2, 129411)	0.29074192727957143
  (2, 406399)	0.32105459490875526
  (2, 433560)	0.3296595898028565
  (2, 77929)	0.31284080750346344
  (2, 443430)	0.3348599670252845
  (2, 266729)	0.24123230668976975
  (2, 409143)	0.15169282335109835
  (2, 178061)	0.1619010109445149
  (2, 150715)	0.18803850583207948
  (2, 132311)	0.2028971570399794
  (2, 288470)	0.16786949597862733
  (3, 406399)	0.29029991238662284
  (3, 158711)	0.4456939372299574
  (3, 151770)	0.278559647704793
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 318303)	0.21254698865277744
  (12

In [30]:
print(x_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2289192 stored elements and shape (320000, 461488)>
  Coords	Values
  (0, 15110)	0.1719352837797837
  (0, 31168)	0.1624772418052177
  (0, 67828)	0.26800375270827315
  (0, 106069)	0.36555450010904555
  (0, 132364)	0.255254889555786
  (0, 138164)	0.23688292264071406
  (0, 171378)	0.2805816206356074
  (0, 271016)	0.45356623916588285
  (0, 279082)	0.17825180109103442
  (0, 388348)	0.2198507607206174
  (0, 398906)	0.34910438732642673
  (0, 409143)	0.3143047059807971
  (0, 420984)	0.17915624523539805
  (1, 6463)	0.30733520460524466
  (1, 15110)	0.211037449588008
  (1, 145393)	0.575262969264869
  (1, 217562)	0.40288153995289894
  (1, 256777)	0.28751585696559306
  (1, 348135)	0.4739279595416274
  (1, 366203)	0.24595562404108307
  (2, 22532)	0.3532582957477176
  (2, 34401)	0.37916255084357414
  (2, 89448)	0.36340369428387626
  (2, 183312)	0.5892069252021465
  (2, 256834)	0.2564939661498776
  :	:
  (319994, 443794)	0.2782185641032538


In [31]:
# Machine learning model: logistic regression.
# Only max_iter is set explicitly (1000); every other setting is left at its default.
model = LogisticRegression(max_iter = 1000)

In [32]:
model.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [34]:
# Model evaluation: accuracy score on the training data.
# Predict on the same feature matrix the model was fitted on, then compare the
# predictions with the training labels.
x_train_prediction = model.predict(x_train)
training_data_accuracy= accuracy_score(y_train, x_train_prediction)

In [35]:
print('accuracy score on the training data is : ', training_data_accuracy)

accuracy score on the training data is :  0.79871953125


In [36]:
# Accuracy score on the test data: predict on the held-out feature matrix and
# compare the predictions with the held-out labels.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [37]:
print('accuracy score on the testing data is : ', test_data_accuracy)

accuracy score on the testing data is :  0.77668125


In [38]:
#model accuracy on the test data is about 77.67% (training data: about 79.87%)

In [49]:
#saving the trained model

In [40]:
import pickle

In [41]:
# Save the fitted classifier to disk.
filename = 'trained_model.sav'
# The context manager closes the file as soon as the dump finishes, so the data is
# fully written before any later cell tries to read it back.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [44]:
#Using the saved model for future predictions
# Load the saved model; the context manager closes the file once it has been read.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [48]:
# Check the reloaded model on a single held-out example.
x_new = x_test[200]   # row 200 of the vectorized test matrix (positional indexing)
print(y_test.iloc[200])   # true label at the same position

# Predict with the model that was loaded from disk rather than the in-memory
# `model`, so this cell really exercises the saved file.
prediction = loaded_model.predict(x_new)
print(prediction)
if prediction[0] == 0:
    print('Negative Tweet')
else:
    print('positive Tweet')

1
[1]
positive Tweet


In [50]:
# Save the fitted vectorizer so new text (e.g. the Facebook data) can be encoded
# with the same vocabulary. The context manager closes the file after the dump.
with open('vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
print("Vectorizer saved successfully!")

Vectorizer saved successfully!


In [51]:
import pickle
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# 1. Load the Model and the Vectorizer
# Each file is opened in a context manager so its handle is closed after reading.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open('vectorizer.sav', 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

# 2. Load your Facebook Data
# Change 'fb_sentiment.csv' to your actual file name
fb_df = pd.read_csv('fb_sentiment.csv', encoding='ISO-8859-1')

# 3. Pre-process the Facebook comments
# (Make sure your 'stemming' function from earlier is defined in this script too!)
# The text is read from the 'FBPost' column. Missing cells are replaced with an
# empty string and every value is coerced to str first, because stemming() applies
# a regex to its argument and would raise on a non-string value such as NaN.
fb_df['clean_comment'] = fb_df['FBPost'].fillna('').astype(str).apply(stemming)

# 4. Transform the Facebook text using the LOADED vectorizer
# Use .transform(), NOT .fit_transform(), so the vocabulary saved with the
# vectorizer is reused as-is.
X_fb = loaded_vectorizer.transform(fb_df['clean_comment'])

# 5. Predict Sentiment
fb_predictions = loaded_model.predict(X_fb)

# 6. Add results to the dataframe
# A score of 1 is labelled 'Positive'; any other score is labelled 'Negative'.
fb_df['sentiment_score'] = fb_predictions
fb_df['sentiment_label'] = fb_df['sentiment_score'].apply(
    lambda score: 'Positive' if score == 1 else 'Negative'
)

# 7. Save the results
fb_df.to_csv('facebook_sentiment_results.csv', index=False)
print("Analysis complete! Results saved to 'facebook_sentiment_results.csv'")

Analysis complete! Results saved to 'facebook_sentiment_results.csv'
