In [127]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from transformers import pipeline

In [42]:
df = pd.read_csv('twitter_new.csv', encoding='iso-8859-1',names=['sentiment','id','time','query','name','tweet'])

In [43]:
df.head()

Unnamed: 0,sentiment,id,time,query,name,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [44]:
df=df[['sentiment','tweet']]

In [45]:
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [46]:
df['tweet']=df['tweet'].str.lower()

In [47]:
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - awww, t..."
1,0,is upset that he can't update his facebook by ...
2,0,@kenichan i dived many times for the ball. man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [63]:
pos=df[df['sentiment']==0][:1000]
neg=df[df['sentiment']==4][:1000]

In [64]:
df=pd.concat([pos,neg],axis=0)

In [65]:
df

Unnamed: 0,sentiment,tweet,tag
0,0,"@switchfoot http://twitpic.com/2y1zl - awww, t...","[@, switchfoot, http, :, //twitpic.com/2y1zl, ..."
1,0,is upset that he can't update his facebook by ...,"[is, upset, that, he, ca, n't, update, his, fa..."
2,0,@kenichan i dived many times for the ball. man...,"[@, kenichan, i, dived, many, times, for, the,..."
3,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,0,"@nationwideclass no, it's not behaving at all....","[@, nationwideclass, no, ,, it, 's, not, behav..."
...,...,...,...
800995,4,i have this strange desire to go to confession...,"[i, have, this, strange, desire, to, go, to, c..."
800996,4,@i_reporter answer sent in dm. try it,"[@, i_reporter, answer, sent, in, dm, ., try, it]"
800997,4,@brooklynunion cuz ur 3pm is my 9am and id be ...,"[@, brooklynunion, cuz, ur, 3pm, is, my, 9am, ..."
800998,4,@littrellfans its all good. just figured you w...,"[@, littrellfans, its, all, good, ., just, fig..."


In [74]:
def word_tokenization(text):
    token= word_tokenize(text)
    stop_word=set(stopwords.words('english'))
    filter_word=[word for word in token if word not in stop_word]
    con_str=" ".join(filter_word)
    return con_str

In [75]:
df['tag']=df['tweet'].apply(word_tokenization)

In [76]:
df['tag'][2]

'@ kenichan dived many times ball . managed save 50 % rest go bounds'

In [77]:
df.head()

Unnamed: 0,sentiment,tweet,tag
0,0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",@ switchfoot http : //twitpic.com/2y1zl - awww...
1,0,is upset that he can't update his facebook by ...,upset ca n't update facebook texting ... might...
2,0,@kenichan i dived many times for the ball. man...,@ kenichan dived many times ball . managed sav...
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....","@ nationwideclass , 's behaving . 'm mad . ? c..."


In [78]:
def punctuation_removal(text):
    text= text.translate(str.maketrans('','',string.punctuation))
    return text

In [79]:
df['tag']=df['tag'].apply(punctuation_removal)

In [80]:
df.head()

Unnamed: 0,sentiment,tweet,tag
0,0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",switchfoot http twitpiccom2y1zl awww s bum...
1,0,is upset that he can't update his facebook by ...,upset ca nt update facebook texting might cry...
2,0,@kenichan i dived many times for the ball. man...,kenichan dived many times ball managed save ...
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",nationwideclass s behaving m mad ca nt see


In [111]:
def lemmetization(text):
    words = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    word= ' '.join(lemmatized_words)
    word=re.sub(r'<.*?>', '', word)
    return word

In [112]:
df['tag']=df['tag'].apply(lemmetization)

In [114]:
df[['tag','sentiment']]

Unnamed: 0,tag,sentiment
0,switchfoot http twitpiccom2y1zl awww s bummer ...,0
1,upset ca nt update facebook texting might cry ...,0
2,kenichan dived many time ball managed save 50 ...,0
3,whole body feel itchy like fire,0
4,nationwideclass s behaving m mad ca nt see,0
...,...,...
800995,strange desire go confession,4
800996,ireporter answer sent dm try,4
800997,brooklynunion cuz ur 3pm 9am id either asleep ...,4
800998,littrellfans good figured would like know,4


In [165]:
x=df['tag']
y=df['sentiment']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [166]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
X_test_tfidf = tfidf_vectorizer.transform(x_test)

In [169]:
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

In [170]:
y_pred = classifier.predict(X_test_tfidf)


In [171]:
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n           0       0.65      0.69      0.67       201\n           4       0.66      0.63      0.65       199\n\n    accuracy                           0.66       400\n   macro avg       0.66      0.66      0.66       400\nweighted avg       0.66      0.66      0.66       400\n'

### Large Language Model

In [32]:
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [57]:
tweets = [

]

results = sentiment_analyzer(tweets)
def sentiment(dic):
    if dic['label']== 'NEGATIVE':
        return 'Negative'
    elif dic['label']== 'POSITIVE':
        return 'Positive'
    else:
        return 'Neutral'
for i in results:
    sentiment=sentiment(i)
    print(sentiment)


Negative
