In [75]:
import re
import json
import numpy as np
import pandas as pd
import nltk

from sklearn.model_selection import train_test_split, GridSearchCV

# libraries for pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.feature_selection import SelectKBest
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Models to try
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [3]:
# load data into python
filename = 'data/tweets.txt'

raw = []
for line in open(filename, 'r'):
    raw.append(json.loads(line))

In [4]:
# move into a df
df = pd.DataFrame(raw)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3731 entries, 0 to 3730
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   created_at                 3731 non-null   object 
 1   id                         3731 non-null   int64  
 2   id_str                     3731 non-null   object 
 3   text                       3731 non-null   object 
 4   source                     3731 non-null   object 
 5   truncated                  3731 non-null   bool   
 6   in_reply_to_status_id      1287 non-null   float64
 7   in_reply_to_status_id_str  1287 non-null   object 
 8   in_reply_to_user_id        1326 non-null   float64
 9   in_reply_to_user_id_str    1326 non-null   object 
 10  in_reply_to_screen_name    1326 non-null   object 
 11  user                       3731 non-null   object 
 12  geo                        0 non-null      object 
 13  coordinates                0 non-null      objec

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3731 entries, 0 to 3730
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   created_at                 3731 non-null   object 
 1   id                         3731 non-null   int64  
 2   id_str                     3731 non-null   object 
 3   text                       3731 non-null   object 
 4   source                     3731 non-null   object 
 5   truncated                  3731 non-null   bool   
 6   in_reply_to_status_id      1287 non-null   float64
 7   in_reply_to_status_id_str  1287 non-null   object 
 8   in_reply_to_user_id        1326 non-null   float64
 9   in_reply_to_user_id_str    1326 non-null   object 
 10  in_reply_to_screen_name    1326 non-null   object 
 11  user                       3731 non-null   object 
 12  geo                        0 non-null      object 
 13  coordinates                0 non-null      objec

In [6]:
df['text'][10]

'@bpwerks @mollyfprince @benshapiro Actions have consequences :) For both men and women.'

In [7]:
# find sentiments in text
sentiment = []

for tweet in df["text"]:
    if ":)" in tweet:
        sentiment.append("positive")
    elif ":(" in tweet:
        sentiment.append("negative")
    else:
        sentiment.append("?")

In [8]:
df["sentiment"] = sentiment
df['sentiment'].value_counts()

positive    1984
?            928
negative     819
Name: sentiment, dtype: int64

In [60]:
# one hot encoding and concatening, remove unknown sentiment tweets
df = df[df.sentiment != "?"]
onehot = pd.get_dummies(df["sentiment"])
ranked_tweets = pd.concat([df["text"],onehot, df["sentiment"]], axis=1)


In [57]:
ranked_tweets

Unnamed: 0,text,negative,positive,sentiment
1,@Diceman27 @Streamboosts Dropped you a follow ...,0,1,positive
2,@sakura_addicted But that was today @ midnight...,1,0,negative
3,RT @brokenworld05: Promotion Time ⏳\n\nMentio...,0,1,positive
4,marnie gave me the pouch from her forgotten wo...,0,1,positive
6,RT @CriticalError09: I hate sex but also I am ...,0,1,positive
...,...,...,...,...
3725,RT @Coco_games____: Join the Metafang event!!\...,0,1,positive
3726,RT @Zoe2Freaky: Kinda new to nsfw twt \n\n💖22\...,0,1,positive
3728,RT @jongseongflirts: this photo sequence :( ht...,1,0,negative
3729,RT @NextLevelGorg: One hand on her waist :)\nO...,0,1,positive


In [61]:
labels = ranked_tweets["sentiment"]

In [62]:
# time to clean the text
processed_tweets = []

for tweet in ranked_tweets['text']:
    print(tweet)
    # Converting to Lowercase
    tweet = tweet.lower()

    # remove rt
    tweet = re.sub('rt', '', tweet)

    # Remove all the special characters
    tweet = re.sub('[^a-zA-Z\s]', '', tweet)
    
    # Remove all single characters
    tweet = re.sub(r'(?:^| )\w(?:$| )', ' ', tweet)

    # Substituting multiple spaces with single space
    tweet = re.sub(r'\s+', ' ', tweet)
    print(tweet)
    processed_tweets.append(tweet)

@Diceman27 @Streamboosts Dropped you a follow on Twitch. :)
diceman streamboosts dropped you follow on twitch 
@sakura_addicted But that was today @ midnight :(  they should still be here
sakuraaddicted but that was today midnight they should still be here
RT @brokenworld05: Promotion Time  ⏳

Mention your  Ids ... I'll Promote 
you :)
Note : First 40 ID'S ❣️

Follow me and retweet 🔄
I will fo…
 brokenworld promotion time mention your ids ill promote you note first ids follow me and retweet i will fo
marnie gave me the pouch from her forgotten world pre-order, and i had no clue what to do with it until yesterday :) https://t.co/2yUC1GXZHM
marnie gave me the pouch from her forgotten world preorder and had no clue what to do with it until yesterday httpstcoyucgxzhm
RT @CriticalError09: I hate sex but also I am very hot :)
 criticalerror hate sex but also am very hot 
smile dog :)
smile dog 
Promotion Time 😁 6pm

Mention your Ids..... I'll Promote you :)

Note : Unlimited ids 👍

Follow me

In [63]:
# import stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [64]:
# vectorize
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
v_tweets = vectorizer.fit_transform(processed_tweets)

In [65]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(v_tweets,labels,test_size=0.25)

In [82]:
# create baseline classifier
baseline_clf = LogisticRegression()

baseline_fit = baseline_clf.fit(X_train, y_train)
baseline_pred = baseline_fit.predict(X_test)



In [83]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(y_test, baseline_pred)

array([[ 95, 124],
       [ 16, 466]], dtype=int64)

In [85]:
accuracy_score(y_test, baseline_pred)

0.8002853067047075

In [69]:
# Create pipeline

pipeline = Pipeline([('classifier', LogisticRegression())])

In [70]:
model = pipeline.fit(X_train, y_train)

In [72]:
model

In [73]:
param_grid = {
    'classifier': [SVC()],
}

In [76]:
grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

In [77]:
grid.fit(X_train, y_train)