<a href="https://colab.research.google.com/github/Vishal152k/Vishal152k.github.io/blob/master/Sentiment_Analysis_Netflix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install and Import necessary Packages

In [None]:
!pip install psycopg2-binary
!pip install praw

In [None]:
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Keras
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

# Utility
import re
import numpy as np
import os
import time

# Praw
import praw
import datetime
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, DateTime, Float
from sqlalchemy.orm import sessionmaker

# Tweepy
import tweepy

## Sentiment Analysis Model

Define Network Parameters

In [None]:
#NETWORK PARAMETERS
# Pattern for everything stripped from a text before tokenising: @mentions,
# http(s) links, and any run of characters that are not ASCII letters/digits.
# Written as a raw string because "\S" is not a valid escape in a normal
# string literal (newer Python versions warn about it).
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S+|[^A-Za-z0-9]+"

# KERAS
SEQUENCE_LENGTH = 300  # every token sequence is padded/truncated to this length
EPOCHS = 10
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (lower, upper): scores <= lower are NEGATIVE, scores >= upper are POSITIVE,
# anything in between is NEUTRAL (see decode_sentiment).
SENTIMENT_THRESHOLDS = (0.4, 0.7)

Define Helper Functions

In [None]:
def map_sentiment(label):
  """Translate a numeric dataset label into its sentiment name.

  `label` is converted with int(); 0 maps to "NEGATIVE" and 4 maps to
  "POSITIVE". Any other integer raises KeyError.
  """
  code = int(label)
  if code == 0:
    return "NEGATIVE"
  if code == 4:
    return "POSITIVE"
  raise KeyError(code)

def clean(text, stem=False):
  """Normalise a piece of text for the tokenizer.

  The input is stringified and lower-cased, every match of TEXT_CLEANING_RE
  is replaced by a space, and the remaining words are re-joined with single
  spaces. `stem` is accepted but not used.
  """
  lowered = str(text).lower()
  scrubbed = re.sub(TEXT_CLEANING_RE, ' ', lowered).strip()
  # split() with no argument drops empty pieces, collapsing whitespace runs.
  return " ".join(scrubbed.split())

def preprocess_data(path='training.1600000.processed.noemoticon.csv',
                    test_size=0.2, random_state=42):
  """Load the labelled tweet CSV, clean it and split it into train/test frames.

  Args:
    path: CSV file to read (headerless, six columns, ISO-8859-1 encoded).
    test_size: fraction of rows placed in the test frame.
    random_state: seed passed to train_test_split so the split is repeatable.

  Returns:
    (df_train, df_test): DataFrames with two columns, `target` (sentiment
    name from map_sentiment) and `text` (output of clean).
  """
  # Only the label and the text are used, so the other four columns are not
  # loaded. Reading exactly the needed columns also gives a fresh frame, so
  # the column assignments below do not write into a slice of another frame.
  train = pd.read_csv(path,
                      encoding="ISO-8859-1",
                      names=["target", "ids", "date", "flag", "user", "text"],
                      usecols=["target", "text"]
                      )
  train["target"] = train["target"].apply(map_sentiment)
  train["text"] = train["text"].apply(clean)
  df_train, df_test = train_test_split(train, test_size=test_size,
                                       random_state=random_state)
  return df_train,df_test

def tokenize(df_train):
  """Fit a Keras Tokenizer on the `text` column of `df_train`.

  Returns:
    (tokenizer, vocab_size) where vocab_size is one more than the number of
    entries in the fitted tokenizer's word_index.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(df_train.text)
  return fitted, len(fitted.word_index) + 1

def data2array(tokenizer, train=None, test=None):
  """Turn the train/test frames into padded sequences and encoded labels.

  Args:
    tokenizer: fitted tokenizer providing texts_to_sequences.
    train: frame with `text` and `target` columns. Defaults to the
      notebook-level `df_train`.
    test: frame with `text` and `target` columns. Defaults to the
      notebook-level `df_test`.

  Returns:
    (x_train, x_test, y_train, y_test). The x arrays are token sequences
    padded to SEQUENCE_LENGTH; the y arrays are integer-encoded targets
    reshaped to a single column.
  """
  # Fall back to the notebook globals so existing data2array(tokenizer)
  # calls keep working.
  train = df_train if train is None else train
  test = df_test if test is None else test

  def to_padded(texts):
    # Texts -> integer sequences -> fixed-length array.
    return pad_sequences(tokenizer.texts_to_sequences(texts),
                         maxlen=SEQUENCE_LENGTH)

  x_train = to_padded(train.text)
  x_test = to_padded(test.text)

  # The encoder is fitted on the training labels only and reused for test.
  encoder = LabelEncoder()
  encoder.fit(train.target.tolist())
  y_train = encoder.transform(train.target.tolist()).reshape(-1,1)
  y_test = encoder.transform(test.target.tolist()).reshape(-1,1)
  return x_train, x_test, y_train, y_test

def get_callbacks(path):
  """Build the list of Keras callbacks used during training.

  The list holds, in order: a learning-rate reducer watching `val_loss`,
  early stopping watching `val_accuracy`, and a checkpoint that writes to
  `path` only when `val_loss` improves.
  """
  return [
      ReduceLROnPlateau(monitor='val_loss',patience=3, cooldown=0),
      EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5),
      ModelCheckpoint(path, monitor='val_loss',save_best_only=True),
  ]

def score(model,x_test,y_test):
  """Evaluate `model` on the test arrays and print accuracy and loss.

  Calls model.evaluate with BATCH_SIZE and prints a blank line, then element
  1 of the result as ACCURACY and element 0 as LOSS. Returns nothing.
  """
  metrics = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
  # The leading newline produces the blank separator line.
  print("\nACCURACY:", metrics[1])
  print("LOSS:", metrics[0])

def plot_history(history):
  """Plot training vs. validation accuracy and loss per epoch.

  Reads the 'accuracy', 'val_accuracy', 'loss' and 'val_loss' series from
  `history.history`. Accuracy is drawn on the current figure, loss on a new
  figure, and both are then shown.
  """
  curves = history.history
  acc, val_acc = curves['accuracy'], curves['val_accuracy']
  loss, val_loss = curves['loss'], curves['val_loss']
  epochs = range(len(acc))

  panels = (
      (acc, val_acc, 'Training accuracy', 'Validation accuracy',
       'Training and Validation accuracy'),
      (loss, val_loss, 'Training loss', 'Validation loss',
       'Training and Validation loss'),
  )
  for position, (train_vals, val_vals, train_label, val_label, title) in enumerate(panels):
    if position > 0:
      # Every panel after the first gets its own figure.
      plt.figure()
    plt.plot(epochs, train_vals, 'b', label=train_label)
    plt.plot(epochs, val_vals, 'r', label=val_label)
    plt.title(title)
    plt.legend()
  plt.show()

def decode_sentiment(score, include_neutral=True):
    """Convert a model score into a sentiment label.

    With include_neutral, scores at or below SENTIMENT_THRESHOLDS[0] are
    NEGATIVE, scores at or above SENTIMENT_THRESHOLDS[1] are POSITIVE and
    everything else is NEUTRAL. Without it, scores below 0.5 are NEGATIVE
    and all others POSITIVE.
    """
    if not include_neutral:
        if score < 0.5:
            return NEGATIVE
        return POSITIVE
    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    return NEUTRAL

def predict(text, include_neutral=True):
  """Score one piece of text with the notebook-level `model` and `tokenizer`.

  Args:
    text: raw text; it is passed through clean() before tokenising.
    include_neutral: forwarded to decode_sentiment.

  Returns:
    dict with "label" (sentiment name), "score" (Python float) and
    "elapsed_time" (seconds spent in this call).
  """
  start_at = time.time()
  text = clean(text)
  x_test = pad_sequences(tokenizer.texts_to_sequences([text]),
                         maxlen=SEQUENCE_LENGTH)
  # model.predict returns an array, not a scalar. Flatten it and take the
  # first value so decode_sentiment compares a plain number and float() is
  # not applied to an array (deprecated in recent NumPy releases).
  score = float(np.ravel(model.predict([x_test]))[0])
  label = decode_sentiment(score, include_neutral=include_neutral)
  return {"label":label, "score": score,
          "elapsed_time": time.time()-start_at}


Load the training data and preprocess it into a form that we can feed into our model

In [None]:
!unzip training_data.zip

In [None]:
# Read the CSV, map labels to sentiment names, clean the text and split 80/20.
df_train, df_test = preprocess_data()

In [None]:
# Fit the tokenizer on the training text only; vocab_size is len(word_index) + 1.
tokenizer,vocab_size = tokenize(df_train)

In [None]:
# Padded token sequences (x_*) and integer-encoded labels as one column (y_*).
x_train,x_test,y_train,y_test = data2array(tokenizer)

Load our pretrained Model

In [None]:
# Load the saved model from disk, then (re)compile it with binary
# cross-entropy loss, the Adam optimizer and accuracy as the reported metric.
model = keras.models.load_model('model_weights.h5')
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

Train the model loaded above (no new model is built in this notebook) and provide the path where the best checkpoint should be saved

In [None]:
# File that the ModelCheckpoint callback writes to when val_loss improves.
save_weights = 'model_weights_2.h5'
callbacks = get_callbacks(save_weights)

In [None]:
# Train for up to EPOCHS epochs, holding out 10% of the training data for
# validation; the callbacks monitor the validation metrics.
# `history` is kept for plot_history below.
history = model.fit(x_train,y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split= 0.1,
                    verbose=1,
                    callbacks=callbacks
                    )

In [None]:
# Compile again with the same loss/optimizer/metrics used when the model was loaded.
# NOTE(review): this looks redundant after the earlier compile - confirm whether it is needed.
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [None]:
# Print accuracy and loss on the held-out test split.
score(model,x_test,y_test)

In [None]:
# Plot training vs. validation accuracy and loss for the fit above.
plot_history(history)

## LIVE DEMONSTRATION

In [None]:
# Score a clearly negative sentence with the notebook's predict() helper,
# which returns the label, the score and the elapsed time.
predict("There's only sad news")

In [None]:
# Score a clearly positive sentence with the notebook's predict() helper,
# which returns the label, the score and the elapsed time.
predict("Our team will bring some good news to you :)")

0.87


## Genre-wise Analysis

Define Helper Functions

In [None]:
def get_tweets(search,number,place):
  """Return an iterator over up to `number` English tweets matching `search`.

  Args:
    search: search term (here, a movie title).
    number: maximum number of tweets the iterator yields.
    place: Twitter place id appended to the query.

  Uses the notebook-level `api` object. The result is a tweepy Cursor item
  iterator; it is consumed where it is iterated, not here.
  """
  # Tweepy 4.0 renamed API.search to API.search_tweets; use whichever exists.
  search_method = getattr(api, 'search_tweets', None) or api.search
  # NOTE(review): the query is built as "<search>--place:<id>" with no space
  # before the operator - confirm this is the intended place filter syntax.
  tweets = tweepy.Cursor(search_method,q=search+'--place:%s'%place,lang='en').items(number)
  return tweets

Authenticate with the Twitter API

In [None]:
# Credentials are read from environment variables so that keys are never
# stored in the notebook; a missing variable raises KeyError naming it.
auth = tweepy.OAuthHandler(os.environ['TWITTER_CONSUMER_KEY'],
                           os.environ['TWITTER_CONSUMER_SECRET'])
auth.set_access_token(os.environ['TWITTER_ACCESS_TOKEN'],
                      os.environ['TWITTER_ACCESS_TOKEN_SECRET'])
# wait_on_rate_limit=True is passed so the client waits when rate-limited.
api = tweepy.API(auth,wait_on_rate_limit=True)

In [None]:
# Country table; every column is read as a string. The cells below use its
# `Country` and `place_id` columns.
countries = pd.read_excel('countries_with_place_ids.xlsx',dtype='str')

Collect tweets for movies in a specific genre for all countries mentioned in our Excel file

In [None]:
# Genre label written on every collected row.
genre = 'Sci-Fi'
# Maximum number of tweets requested per (movie, country) pair.
number = 20
# Movie titles come from the first column of the spreadsheet.
sci_fi = pd.read_excel('SciFiMovieList.xlsx')
movie_list = list(sci_fi.iloc[:, 0])

In [None]:
# Build one entry per (movie, country) pair. Despite its name, `tweet_dict`
# is a list of dicts with keys 'movie', 'country' and 'tweets'.
# 'tweets' holds whatever get_tweets returned for that pair.
# Note: `i` and `tweets` remain defined at notebook scope after this loop.
tweet_dict = []
for movie in movie_list:
  for i in range(len(countries)):
    tweets = get_tweets(movie,number,countries.place_id[i])
    tweet_dict.append({
        'movie':movie,
        'country':countries.Country[i],
        'tweets':tweets   
    }
                      )

In [None]:
# One list per output column, filled in parallel (one element per tweet).
# The names match the ones the collection loop appends to and the DataFrame
# cell reads.
tweet_genre = []
tweet_movie = []
tweet_country = []
tweet_text = []
tweet_user_age = []

In [None]:
start = time.time()
# Start from empty accumulators so this cell does not depend on earlier
# state. The stored tweet iterators are presumably single-use, so re-run the
# cell that builds tweet_dict before re-running this one.
tweet_genre, tweet_movie, tweet_country, tweet_text, tweet_user_age = [], [], [], [], []
# enumerate() starts at entry 0 on every run, instead of relying on an index
# left over from an earlier loop.
for i, entry in enumerate(tweet_dict):
  for tweet in entry['tweets']:
    tweet_genre.append(genre)
    tweet_movie.append(entry['movie'])
    tweet_country.append(entry['country'])
    tweet_text.append(tweet.text)
    # Creation date of the author's account (not of the tweet).
    tweet_user_age.append(tweet.user.created_at)
  print(len(tweet_dict)-i-1,'left')
print(time.time()-start)

Gather the collected tweets into a pandas DataFrame, determine the sentiment of each tweet, and save the result to an Excel spreadsheet

In [None]:
# One row per collected tweet.
genre_data = pd.DataFrame({
    'Genre': pd.Series(tweet_genre),
    'Movie': pd.Series(tweet_movie),
    'Country': pd.Series(tweet_country),
    'Tweet': pd.Series(tweet_text),
    'TwitterAge': pd.Series(tweet_user_age),
})

# Run the model once per tweet and reuse the result for both columns,
# instead of calling predict() twice per tweet.
predictions = [predict(text) for text in genre_data['Tweet']]
genre_data['Sentiment'] = [p['label'] for p in predictions]
genre_data['SentimentScore'] = [p['score'] for p in predictions]

In [None]:
# Display the assembled frame as the cell output.
genre_data

In [None]:
# Write the frame to an Excel file without the index column.
genre_data.to_excel('SciFi_data.xlsx',index=False)