<a href="https://colab.research.google.com/github/Sara-Samer/sentiment-analsis-with-TF-IDF/blob/main/Sentiment_analsis_with_TF_IDF_and_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data manipulation

## Google sheet setup
---
This setup lets the notebook read the dataset directly from Google Sheets, without downloading it first.


Google Sheet link *(please open this link first, then run the notebook)*: [sheet](https://docs.google.com/spreadsheets/d/1rrlGMui0b2Xniac-YjUtnpq3KapdkXJtFaWEdkdtCNI/edit?usp=sharing)

In [None]:
# Authenticate the current Colab session with a Google account.
# NOTE(review): presumably this is what makes the application-default
# credentials requested in the next cell available — confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
import gspread
from oauth2client.client import GoogleCredentials

# Build a gspread client from the application-default Google credentials,
# then open the spreadsheet named '2k_reviews' and take its `sheet1` worksheet.
gc = gspread.authorize(GoogleCredentials.get_application_default())
worksheet = gc.open('2k_reviews').sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values() # used later, when the data is loaded into a DataFrame

## imports

In [None]:
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/11/4d/378ab91284c2c3a06ab475b287721c09b7951d5ecb3edf4ffb0e1e7a568a/contractions-0.0.49-py2.py3-none-any.whl
Collecting textsearch>=0.0.21
  Downloading https://files.pythonhosted.org/packages/d3/fe/021d7d76961b5ceb9f8d022c4138461d83beff36c3938dc424586085e559/textsearch-0.0.21-py2.py3-none-any.whl
Collecting anyascii
[?25l  Downloading https://files.pythonhosted.org/packages/a3/14/666cd44bf53f36a961544af592cb5c5c800013f9c51a4745af8d7c17362a/anyascii-0.2.0-py3-none-any.whl (283kB)
[K     |████████████████████████████████| 286kB 2.8MB/s 
[?25hCollecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/7f/c2/eae730037ae1cbbfaa229d27030d1d5e34a1e41114b21447d1202ae9c220/pyahocorasick-1.4.2.tar.gz (321kB)
[K     |████████████████████████████████| 327kB 32.1MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

# Fetch the NLTK data packages this notebook relies on.
# 'stopwords' backs the stopwords.words("english") lookup in `analyzer`.
nltk.download('stopwords')
# NOTE(review): presumably the data behind WordNetLemmatizer — confirm.
nltk.download('wordnet')
# NOTE(review): presumably for word_tokenize, which is imported above but
# not called in the visible cells — confirm whether it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Import data

In [None]:
# The first sheet row is skipped (presumably a header row); the remaining
# rows become the records, with the two columns named explicitly.
data_rows = rows[1:]
column_names = ['review', 'sentiment']
df = pd.DataFrame.from_records(data_rows, columns=column_names)

# Preview the first few records.
df.head()

Unnamed: 0,review,sentiment
0,"the year is barely a week old , and there is a...",0
1,those of you who frequently read my reviews ar...,1
2,the promotion for fear and loathing in las veg...,1
3,"synopsis : original "" jurassic park "" survivor...",0
4,it stands as a moment one will not soon forget...,1


# TF-IDF

## TF-IDF analyzer

In [None]:
# Lazily-filled cache so the stopword list, lemmatizer and stemmer are built
# once instead of once per document.
_ANALYZER_RESOURCES = {}


def _get_analyzer_resources():
    """Return the shared (stopword set, lemmatizer, stemmer), building them on first use."""
    if not _ANALYZER_RESOURCES:
        # Build everything before publishing it, so a failure (e.g. missing
        # NLTK data) never leaves a half-filled cache behind.
        built = {
            "stops": frozenset(stopwords.words("english")),
            "lemmatizer": WordNetLemmatizer(),
            "stemmer": PorterStemmer(),
        }
        _ANALYZER_RESOURCES.update(built)
    return (
        _ANALYZER_RESOURCES["stops"],
        _ANALYZER_RESOURCES["lemmatizer"],
        _ANALYZER_RESOURCES["stemmer"],
    )


def analyzer(text):
    """Turn one raw review into a list of normalized tokens for TF-IDF.

    Steps, in order: strip HTML markup, expand contractions, replace '/'
    with a space, drop every character that is neither a word character nor
    whitespace, lowercase and split on whitespace, remove English stopwords,
    lemmatize each token (as a verb, then as a noun) and finally stem it.

    Parameters
    ----------
    text : str
        The raw document text.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    stops, lemmatizer, stemmer = _get_analyzer_resources()

    # Clean HTML tags. The parser is named explicitly so the result does not
    # depend on which optional parsers are installed, and so Beautiful Soup
    # does not warn about having to guess one.
    filtered_text = BeautifulSoup(text, "html.parser").get_text()
    # Expand contractions
    filtered_text = contractions.fix(filtered_text)
    # Slashes become spaces first so "running/walking" stays two words once
    # the remaining punctuation is deleted.
    filtered_text = re.sub(r'/', ' ', filtered_text)
    filtered_text = re.sub(r'[^\w\s]', '', filtered_text)
    # Tokenize and remove stopwords
    words = [w for w in filtered_text.lower().split() if w not in stops]
    # Lemmatize (verb, then noun) and stem each token in a single pass
    return [
        stemmer.stem(
            lemmatizer.lemmatize(lemmatizer.lemmatize(w, pos="v"), pos="n")
        )
        for w in words
    ]


In [None]:
# Sanity-check the analyzer on a sentence that contains a contraction,
# slashes, punctuation and inflected verbs.
sample_review = "He'd , a young child, was running/walking and/or eating at same time -bad comment at the time :)-. He has bad habit of swimming after playing long hours in the Sun."
analyzer(sample_review)

['would',
 'young',
 'child',
 'run',
 'walk',
 'eat',
 'time',
 'bad',
 'comment',
 'time',
 'bad',
 'habit',
 'swim',
 'play',
 'long',
 'hour',
 'sun']

## TF-IDF vectorizer

In [None]:
# Learn the TF-IDF vocabulary and weights from the review texts, using the
# custom `analyzer` for all preprocessing and tokenization.
# The DataFrame is created with the columns 'review' and 'sentiment', so the
# text must be read from 'review' (there is no 'text' column).
vectorizer = TfidfVectorizer(analyzer=analyzer)
tfidf = vectorizer.fit_transform(df['review'])
# Target labels, one per review.
# NOTE(review): values read from the sheet are presumably strings; cast with
# .astype(int) if numeric labels are required downstream — confirm.
y = df['sentiment']

# Regression model

## Train/test split

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf,
    y,
    test_size=0.3,
    random_state=0,
)

## Logistic regression model

In [None]:
# Logistic regression with built-in 5-fold cross-validation, scored on
# accuracy and fitted on the training split only.
model = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=3,
    max_iter=300,
).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.5s finished


# Test

In [None]:
# Score the fitted model on the held-out test split (the model was configured
# with scoring='accuracy').
model.score(X_test,y_test)

0.8283333333333334

In [None]:
# Classify a few hand-written reviews with the fitted vectorizer and model.
query = [
    "Worst movie I have ever seen in my entire life",
    "greatest plot ever, it was a master piece",
    "The movie was meh, not much to be a fan of",
    "I liked it a lot, the hero was smart and the female lead was amazing",
]

# transform (not fit_transform): reuse the vocabulary learned from the dataset.
features = vectorizer.transform(query)
model.predict(features)

array([0, 1, 0, 1])