# Text Analysis

## Libraries

In [1]:
## General
import numpy as np
import pandas as pd
import re
import pickle

## Text Analysis
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from textblob import TextBlob

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sergio\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ModuleNotFoundError: No module named 'textblob'

## Data Preprocessing

In [39]:
## Reading in data
with open('data/movies_text.pkl', 'rb') as fp:
    data = pickle.load(fp)

In [40]:
data.iloc[1][5]

"['Action', 'Sci-Fi']"

In [41]:
## Cleaning data for text preprocessing

### Cleaning helper function


temp = data.actors[0:10].copy()

actors = list(data.actors)

for elem in temp:
    actors[actors.index(elem)] = re.sub("\[|\]|\'| ", '', elem).split(',')


In [42]:

new_data = data.copy().drop('Unnamed: 0', axis = 1)

new_data.genres = new_data.genres.fillna('None')

def cleaner(x):
    to_replace = "\[|\]|\'|:|directors|\"| "

    x.actors = set(re.sub(to_replace, '', x.actors).lower().split(','))
    x.genres = set(re.sub(to_replace, '', x.genres).lower().split(','))
    
    ## Text processing

    ### Tokenizing and removing stopwords in description
    processed_desc = [word for word in word_tokenize(x.description) if word not in stopwords.words('english')]
    
    ### Lemmatizing words in processed description
    lem = WordNetLemmatizer()
    processed_desc = [lem.lemmatize(word) for word in processed_desc]

    ### removing punctuation and converting back into a string
    processed_desc = ' '.join([word for word in processed_desc if word.isalnum()])

    ## Adding processed descriptions to row
    processed_desc_s = pd.Series([processed_desc], index = ['processed_desc'])
    x = x.append(processed_desc_s)
    
    return x

movies = new_data.apply(cleaner, axis = 1)



## Data Modelling

### Sentiment Analysis
Using `textblob` for sentiment analysis.

In [6]:
## Adding sentiment polarity scores as a column to dataset
movies['desc_polarity'] = movies.processed_desc.apply(lambda x: TextBlob(x).sentiment.polarity)

### Document Similarity
Using `sklearn` and `tf-idf` to measure similarity between descriptions

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf_processor(text1, text2):
    vectorizer = TfidfVectorizer()
    documents = [text1, text2]

    results = vectorizer.fit_transform(documents)

    similarity = cosine_similarity(results[0:1], results[1:]).flatten()

    return similarity[0]


def dataframe_processor(df=):
    m = df.movie

    cos_sim_df = pd.DataFrame()

    for val in df.values:
        similarities = [tf_idf_processor(val[9], x[9]) for x in df.values]
        series = pd.Series(similarities)
        cos_sim_df = cos_sim_df.append(series, ignore_index = True)
    
    cos_sim_df.columns = m
    cos_sim_df.index = m
    cos_sim_df = cos_sim_df.reset_index(drop = False).rename_axis(None)
    
    return cos_sim_df

In [24]:
tf_idf_processor(movies.iloc[0]['processed_desc'], movies.iloc[1]['processed_desc'])

0.046043606474267834