In [1]:
from dataset import load_dataset, preprocess

from nltk import word_tokenize, sent_tokenize, ngrams
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import random

import pandas as pd
from collections import defaultdict

import matplotlib.pyplot as plt

In [2]:
dataset = load_dataset()
dataset = preprocess(dataset)

In [3]:
plots = list(dataset['Plot'])
plots_as_string = " ".join(plots)

In [4]:
n = 1
unigrams = ngrams(plots_as_string.split(), n)

In [5]:
y = dataset['Genre'].values
print(y.shape)
x = dataset['Plot'].values
print(x.shape)

In [6]:
(x_train, x_test, y_train, y_test) = train_test_split(x, y, test_size=0.4)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

In [11]:
df1 = pd.DataFrame(x_train)
df1 = df1.rename(columns={0: 'plot'})

In [12]:
df2 = pd.DataFrame(y_train)
df2 = df2.rename(columns={0: 'genre'})
df_train = pd.concat([df1, df2], axis=1)

In [13]:
df_train.head

In [14]:
df3 = pd.DataFrame(x_test)
df3 = df3.rename(columns={0: 'plot'})
df4 = pd.DataFrame(y_test)
df4 = df2.rename(columns={0: 'genre'})
df_test = pd.concat([df3, df4], axis=1)
df_test.head()

In [15]:
nltk.download('stopwords')

# Itération 1

In [48]:
def generate_N_grams(text, ngram=1):
    words = [word for word in text.split(" ") if word.lower() not in set(stopwords.words('english'))]
    temp = zip(*[words[i:] for i in range(0, ngram)])
    ans = [' '.join(ngram) for ngram in temp]
    return ans

In [49]:
generate_N_grams("A really funny movie", 2)

In [50]:
defdict = {}

test = df_train.head(500)

for genre in test['genre'].values:
    if genre not in defdict:
        defdict[f'{genre}'] = defaultdict(int)
    for text in test[test['genre'] == genre]['plot']:
        for word in generate_N_grams(text, 3):
            defdict[f'{genre}'][word] += 1

In [51]:
dfall = {}
for genre in test['genre'].values:
    dfall[f'{genre}'] = pd.DataFrame(sorted(defdict[genre].items(),key=lambda x:x[1],reverse=True))

In [52]:
pd1=dfall["drama"][0][:5]
pd2=dfall["drama"][1][:5]

In [56]:
plt.figure(1,figsize=(8,4))
plt.bar(pd1,pd2, color ='green',
        width = 0.4)
plt.xlabel("Words in positive dataframe")
plt.ylabel("Count")
plt.title("Top 10 words in drama genre - UNIGRAM ANALYSIS")
plt.savefig("positive-unigram.png")
plt.show()

# Itération 2

In [17]:
defdict2 = {}

test = df_train.head(500)

for genre in test['genre'].values:
    if genre not in defdict2:
        defdict2[f'{genre}'] = defaultdict(int)
    for text in test[test['genre'] == genre]['plot']:
        for word in generate_N_grams(text,3):
            defdict2[f'{genre}'][word] += 1

In [18]:
dfall2 = {}
for genre in test['genre'].values:
    dfall2[f'{genre}'] = pd.DataFrame(sorted(defdict2[genre].items(), key=lambda x: x[1], reverse=True))

In [57]:
def show_result(genre):
    color=random.choice(['red', 'blue', 'green', 'yellow', 'black', 'white', 'pink'])
    pd1 = dfall2[genre][0][:5]
    pd2 = dfall2[genre][1][:5]
    plt.figure(1, figsize=(8, 4))
    plt.bar(pd1, pd2, color=color,
            width=0.4)
    plt.xlabel("Words in positive dataframe")
    plt.ylabel("Count")
    plt.title(f"Top 10 words in {genre} genre - TRIGRAM ANALYSIS")
    plt.savefig("positive-unigram.png")
    plt.show()

In [58]:
show_result("drama")

# Itération 3

In [38]:
def generate_N_grams(text, ngram=1):
    words = [word for word in text if word.lower() not in set(stopwords.words('english'))]
    temp = zip(*[words[i:] for i in range(0, ngram)])
    ans = [' '.join(ngram) for ngram in temp]
    return ans

In [43]:
defdict3 = {}

test = df_train.head(500)

for genre in test['genre'].values:
    if genre not in defdict3:
        defdict3[f'{genre}'] = defaultdict(int)
    for text in test[test['genre'] == genre]['plot']:
        text = word_tokenize(text)
        text = [word for word in text if len(word) > 1 and word != 'i']
        for word in generate_N_grams(text, 3):
            defdict3[f'{genre}'][word] += 1

In [44]:
dfall3 = {}
for genre in test['genre'].values:
    dfall3[f'{genre}'] = pd.DataFrame(sorted(defdict3[genre].items(), key=lambda x: x[1], reverse=True))

In [65]:
def show_result(genre):
    color = random.choice(['red', 'blue', 'green', 'yellow', 'black', 'white', 'pink'])
    pd1 = dfall3[genre][0][:5]
    pd2 = dfall3[genre][1][:5]
    plt.figure(1, figsize=(12, 4))
    plt.bar(pd1, pd2, color=color,
            width=0.4)
    plt.xlabel("Words in positive dataframe")
    plt.ylabel("Count")
    plt.title(f"Top 10 words in {genre} genre - TRIGRAM ANALYSIS")
    plt.savefig("positive-unigram.png")
    plt.show()

In [66]:
show_result("drama")

# With NLTK

In [4]:
plots_train = plots[:500]
train_strings = " ".join(plots_train)
train_strings = train_strings.lower()
train_tokens = [word_tokenize(s) for s in sent_tokenize(train_strings)]

plots_test = plots[100:]
test_strings = " ".join(plots_test)
test_strings = test_strings.lower()
test_tokens = [word_tokenize(s) for s in sent_tokenize(test_strings)]

In [5]:
n = 3
train, vocab = padded_everygram_pipeline(n, train_tokens)
lmLS = Laplace(n) # Laplace smoothing

In [6]:
lmLS.fit(train, vocab)

In [7]:
print(lmLS.vocab.lookup(["movie", "fun"]))

('movie', 'fun')


In [8]:
print(lmLS.score("fun", ["have"]))

8.894423196655697e-05


In [15]:
begin = 'the'
l = []
for i in range(5):
    l.append(f"{begin} {' '.join(lmLS.generate(10, text_seed=[begin]))}")

In [16]:
l

['the police for a six-month tour of south america and returns',
 'the brooding landscape : the gesticulating passengers in the adirondacks .',
 'the north , one of these rich people would take a',
 'the old man , made easier by the local butterflies ,',
 'the forgiveness of jack on his return , he lies unidentified']

In [10]:
print(lmLS.perplexity(train_tokens))
print(lmLS.perplexity(test_tokens))

11083.795764068143
11096.331932303725
