In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
# nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/vipul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vipul/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/vipul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Initialize the WordNet Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [3]:
# Function to lemmatize text and remove stopwords
def lemmatize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.lower() not in stop_words]
    return ' '.join(lemmatized_tokens)

In [4]:
# Initialize an empty list to hold the processed chunks
filtered_chunks = []

In [5]:

df = pd.read_csv('/home/vipul/Project/Sentiment Analysis/sentiment_datasets/training.1600000.processed.noemoticon.csv', chunksize=10000)
print(df)

# for data in df: 
#     print(data.shape)

for chunk in df:
    chunk['filtered_tweet'] = chunk['text of the tweet�'].apply(lambda x: lemmatize_and_remove_stopwords(x))
    filtered_chunks.append(chunk)

<pandas.io.parsers.readers.TextFileReader object at 0x7927fe3afb60>


In [6]:
# filtered_chunks

In [7]:
df_filtered = pd.concat(filtered_chunks)
df_filtered


Unnamed: 0,polarity of tweet�,id of the tweet,date of the tweet,query,user,text of the tweet�,filtered_tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset ca n't update Facebook texting ... might...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,@ Kenichan dived many time ball . Managed save...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","@ nationwideclass , 's behaving . 'm mad . ? c..."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,@ Kwesidei whole crew
...,...,...,...,...,...,...,...
1048567,4,1960186342,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Madelinedugganx,My GrandMa is making Dinenr with my Mum,GrandMa making Dinenr Mum
1048568,4,1960186409,Fri May 29 07:33:43 PDT 2009,NO_QUERY,OffRoad_Dude,Mid-morning snack time... A bowl of cheese noo...,Mid-morning snack time ... bowl cheese noodle ...
1048569,4,1960186429,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Falchion,@ShaDeLa same here say it like from the Termi...,@ ShaDeLa say like Terminiator movie . come li...
1048570,4,1960186445,Fri May 29 07:33:44 PDT 2009,NO_QUERY,jonasobsessedx,@DestinyHope92 im great thaanks wbuu?,@ DestinyHope92 im great thaanks wbuu ?


In [8]:
df_filtered.rename(columns={"polarity of tweet�" : "sentiments", "filtered_tweet" : "tweet"}, inplace=True)
dataset = df_filtered[['sentiments', 'tweet']]
dataset

Unnamed: 0,sentiments,tweet
0,0,upset ca n't update Facebook texting ... might...
1,0,@ Kenichan dived many time ball . Managed save...
2,0,whole body feel itchy like fire
3,0,"@ nationwideclass , 's behaving . 'm mad . ? c..."
4,0,@ Kwesidei whole crew
...,...,...
1048567,4,GrandMa making Dinenr Mum
1048568,4,Mid-morning snack time ... bowl cheese noodle ...
1048569,4,@ ShaDeLa say like Terminiator movie . come li...
1048570,4,@ DestinyHope92 im great thaanks wbuu ?


In [9]:
dataset['sentiments'].unique()

array([0, 4])

In [10]:
df1 = pd.read_csv('/home/vipul/Project/Sentiment Analysis/sentiment_datasets/training.1600000.processed.noemoticon.csv')
df1

Unnamed: 0,polarity of tweet�,id of the tweet,date of the tweet,query,user,text of the tweet�
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1048567,4,1960186342,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Madelinedugganx,My GrandMa is making Dinenr with my Mum
1048568,4,1960186409,Fri May 29 07:33:43 PDT 2009,NO_QUERY,OffRoad_Dude,Mid-morning snack time... A bowl of cheese noo...
1048569,4,1960186429,Fri May 29 07:33:44 PDT 2009,NO_QUERY,Falchion,@ShaDeLa same here say it like from the Termi...
1048570,4,1960186445,Fri May 29 07:33:44 PDT 2009,NO_QUERY,jonasobsessedx,@DestinyHope92 im great thaanks wbuu?


In [11]:
df1['polarity of tweet�'].value_counts()

polarity of tweet�
0    799996
4    248576
Name: count, dtype: int64

In [31]:
X = dataset['tweet'].tolist()

In [32]:
y = dataset['sentiments'].tolist()

In [33]:
X

["upset ca n't update Facebook texting ... might cry result School today also . Blah !",
 '@ Kenichan dived many time ball . Managed save 50 % rest go bound',
 'whole body feel itchy like fire',
 "@ nationwideclass , 's behaving . 'm mad . ? ca n't see .",
 '@ Kwesidei whole crew',
 'Need hug',
 "@ LOLTrish hey long time see ! Yes .. Rains bit , bit LOL , 'm fine thanks , 's ?",
 're-pierced ear',
 "@ caregiving could n't bear watch . thought UA loss embarrassing . . . . .",
 '@ octolinz16 count , idk either . never talk anymore',
 "@ smarrison would 've first , n't gun . really though , zac snyder 's doucheclown .",
 '@ iamjazzyfizzle wish got watch ! ! miss @ iamlilnicki premiere ? !',
 "Hollis ' death scene hurt severely watch film wry director cut ?",
 'file tax',
 '@ LettyA ahh ive always wanted see rent love soundtrack ! !',
 '@ FakerPattyPattz Oh dear . drinking forgotten table drink ?',
 "@ alydesigns day n't get much done",
 "one friend called , asked meet Mid Valley today ...

In [34]:
len(dataset['sentiments'])

1048572

In [35]:
len(dataset['tweet'])

1048572

In [36]:
y

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [37]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [39]:
# print("X_train :", X_train.shape)
# print("y_train :", y_train.shape)
# print("X_test :", X_test.shape)
# print("y_test :", y_test.shape)

In [22]:
# Fit and transform the documents
# tfidf_matrix = vectorizer.fit_transform(dataset['tweet'].tolist())
# tfidf_matrix

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [41]:
# Fit and transform the training data (this will be a sparse matrix)
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data (this will be a sparse matrix)
X_test_tfidf = vectorizer.transform(X_test)

In [43]:
# print("X_train_tfidf :", X_train_tfidf.shape)
# print("y_train :", y_train.shape)

In [44]:
X_train_tfidf

<734000x374316 sparse matrix of type '<class 'numpy.float64'>'
	with 5581036 stored elements in Compressed Sparse Row format>

In [45]:
# Initialize and train the Logistic Regression model (supports sparse matrices)
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# # Get the feature names (terms)
# feature_names = vectorizer.get_feature_names_out()

# # Convert the TF-IDF sparse matrix to a dense matrix
# dense_matrix = tfidf_matrix.todense()

# # Convert the TF-IDF matrix to a DataFrame
# tf_matrix = pd.DataFrame(dense_matrix, columns=feature_names)

In [46]:
y_pred = model.predict(X_test_tfidf)

In [47]:
y_pred

array([4, 0, 0, ..., 0, 4, 0])

In [59]:
test = "feeling better for some reason"


In [60]:
inp = vectorizer.transform([test])

In [61]:
ans = model.predict(inp)
print(ans)

[0]
