In [29]:
#link google drive
!pip install -U -q PyDrive 
  
from pydrive.auth import GoogleAuth 
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials   

# Authenticate and create the PyDrive client. 
auth.authenticate_user() 
gauth = GoogleAuth() 
gauth.credentials = GoogleCredentials.get_application_default() 
drive = GoogleDrive(gauth)

In [30]:
#get the csv file from google drive
link = 'https://drive.google.com/file/d/1dTIWNpjlrnTQBIQtaGOh0jCRYZiAQO79/view'

# to get the id part of the file 
id = link.split("/")[-2] 

downloaded = drive.CreateFile({'id':id})  
downloaded.GetContentFile('SentimentTweets.csv')

In [31]:
#read from csv and store data
import pandas as pd 

df = pd.read_csv("SentimentTweets.csv",
                header = None, encoding = "ISO-8859-1",
                names=['val','target', 'id', 'date', 'flag', 'user', 'text'],
                dtype='unicode')

In [32]:
df1 = df.sample(n=10000)
features = df1['text'].values
labels = df1['target'].values
#features = df['text'].values
#labels = df['target'].values

In [33]:
import re
import nltk 
import spacy
import en_core_web_sm
from nltk.tokenize import word_tokenize
nltk.download('punkt')

sp = spacy.load('en_core_web_sm')
all_stopwords = sp.Defaults.stop_words

processed_features = []

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

def sentence_lemmatization(sentence):
    lemma_words = []
    for word in sentence:
        lemma_words.append(word.lemma_)
    return ' '.join(lemma_words)

def remove_stopwords(senctence):
    text_tokens = word_tokenize(str(features[sentence]))
    filtered_words = [word for word in text_tokens if not word in all_stopwords]
    return ' '.join(filtered_words)

for sentence in range(0, len(features)):
    # Remove stopwords
    processed_feature = remove_stopwords(str(features[sentence]))

    # "nlp" Object is used to create documents with linguistic annotations.
    my_doc = nlp(processed_feature)

    # Use spacy to lemmalize the sentences
    processed_feature = sentence_lemmatization(my_doc)

    # Remove urls
    processed_feature = re.sub(r'http\S+|www\S+|https\S+', '', processed_feature)
 
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', processed_feature)

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
#Use TfidfVectorizer to vectorize data
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')

vectorizer = TfidfVectorizer (max_df=1.0, min_df=1, max_features=2000)
processed_features = vectorizer.fit_transform(processed_features).toarray()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=4)

In [36]:
#Model Development
from sklearn.linear_model import LogisticRegression

# all parameters not specified are set to their defaults
# default solver is incredibly slow thats why we change it
logreg = LogisticRegression(solver = 'lbfgs')

logreg.fit(X_train, y_train)
# make predictions on the testing set
y_pred = logreg.predict(X_test)

In [37]:
#Evaluation
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test,y_pred))
print("Accuracy score:",accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.73      0.75      1023
           4       0.73      0.76      0.74       977

    accuracy                           0.74      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.75      0.74      0.74      2000

Accuracy score: 0.7445
