In [1]:
%matplotlib inline
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re

## Reading the data, preprocessing the text and extracting synopsis column

In [2]:
df = pd.read_csv("ANNreviews.csv")
df.head()

Unnamed: 0,Name2,Review,Synopsis,Good_points,Ratings,Name,Bad_points
0,GN 7,"\r\n,\r\nA Silent Voice reached its climax at ...","At last, Shoya has woken up. Racing to the bri...",Maintains A Silent Voice's usual acuity of ar...,"{'Overall': 'B+', 'Art': 'A-', 'Story': 'B-'}",A Silent Voice,"Feels a bit overlong in its resolutions, and ..."
1,DVD 1,"\r\n,\r\nTo fully and correctly appreciate 009...",In a world where the Cold War continues well p...,"Musical score, fan service.","{'Animation': 'B-', 'Overall (dub)': 'B', 'Mus...",009-1,"Some character designs are appallingly ugly, ..."
2,Sub.Blu-Ray,"\r\n,\r\nWhat happens when you take twenty-sev...",Haruto can't bring himself to accept that his ...,"Some great voice work and nice animation, tri...","{'Animation': 'B', 'Overall (sub)': 'C', 'Musi...",A Town Where You Live,Story comes off melodramatic without much emo...
3,GN 6,"\r\n,\r\nIt's always an interesting experience...","Seeing Shoko about to leap off her balcony, Sh...",Doubles down on the manga's terrific characte...,"{'Overall': 'A', 'Art': 'A-', 'Story': 'A'}",A Silent Voice,A couple dramatic points felt a little loosel...
4,GN 4,"\r\n,\r\nIt feels like A Silent Voice's densit...",Things continue to change little by little for...,Continues to demonstrate wondrously poignant ...,"{'Overall': 'A-', 'Art': 'B', 'Story': 'A'}",A Silent Voice,Some dramatic developments here could have us...


In [9]:
from nltk import word_tokenize

def preprocess_text(text):
    if not pd.isnull(text):
        text = re.sub(r'\n|\t|\r', r' ', text, flags = re.DOTALL) #removing \n, \t and \r
        text = re.sub(r'[\(\)\,\.]', r' ', text) # removing . , ( ) [ ]
        text = re.sub(r' +', r' ', text) #multiple spaces to one space
        text = text.lower()
        text = word_tokenize(text)   
    return text

In [10]:
# Preprocessing all text (except titles) form Anime News Network
columns = ["Synopsis", "Review" ,"Good_points", "Bad_points"]
df2 = df.copy()
for col in columns:
    df2[col] = df[col].map(preprocess_text)

In [11]:
df2.head()

Unnamed: 0,Name2,Review,Synopsis,Good_points,Ratings,Name,Bad_points
0,GN 7,"[a, silent, voice, reached, its, climax, at, t...","[at, last, shoya, has, woken, up, racing, to, ...","[maintains, a, silent, voice, 's, usual, acuit...","{'Overall': 'B+', 'Art': 'A-', 'Story': 'B-'}",A Silent Voice,"[feels, a, bit, overlong, in, its, resolutions..."
1,DVD 1,"[to, fully, and, correctly, appreciate, 009-1,...","[in, a, world, where, the, cold, war, continue...","[musical, score, fan, service]","{'Animation': 'B-', 'Overall (dub)': 'B', 'Mus...",009-1,"[some, character, designs, are, appallingly, u..."
2,Sub.Blu-Ray,"[what, happens, when, you, take, twenty-seven,...","[haruto, ca, n't, bring, himself, to, accept, ...","[some, great, voice, work, and, nice, animatio...","{'Animation': 'B', 'Overall (sub)': 'C', 'Musi...",A Town Where You Live,"[story, comes, off, melodramatic, without, muc..."
3,GN 6,"[it, 's, always, an, interesting, experience, ...","[seeing, shoko, about, to, leap, off, her, bal...","[doubles, down, on, the, manga, 's, terrific, ...","{'Overall': 'A', 'Art': 'A-', 'Story': 'A'}",A Silent Voice,"[a, couple, dramatic, points, felt, a, little,..."
4,GN 4,"[it, feels, like, a, silent, voice, 's, densit...","[things, continue, to, change, little, by, lit...","[continues, to, demonstrate, wondrously, poign...","{'Overall': 'A-', 'Art': 'B', 'Story': 'A'}",A Silent Voice,"[some, dramatic, developments, here, could, ha..."


In [13]:
synopsis = df2["Synopsis"].copy()

In [14]:
synopsis[:10]

0    [at, last, shoya, has, woken, up, racing, to, ...
1    [in, a, world, where, the, cold, war, continue...
2    [haruto, ca, n't, bring, himself, to, accept, ...
3    [seeing, shoko, about, to, leap, off, her, bal...
4    [things, continue, to, change, little, by, lit...
5    [it, 's, a, few, years, in, the, future, when,...
6    [six, short, stories, reveal, how, love, lette...
7    [summer, vacation, is, approaching, and, tomoh...
8    [six, years, ago, shoya, 's, bullying, of, his...
9    [years, ago, it, was, shoya, 's, bullying, tha...
Name: Synopsis, dtype: object

In [15]:
num_of_docs = len(synopsis)
num_of_docs

4174

## Counting occurrences

In [17]:
from collections import defaultdict

doc_count = defaultdict(int)

for syn in synopsis:
    updated = defaultdict(bool)
    for word in syn:
        if not updated[word]:
            doc_count[word] += 1
            updated[word] = True

In [21]:
doc_freq = {word: count/num_of_docs for word, count in doc_count.items()}

In [30]:
print(doc_count['fears—even'], ',', doc_freq['fears—even'])

1   0.00023957834211787255


## Flitering out least common words

In [55]:
min_freq = 0.02
word_list = [word for word in doc_freq.keys() if doc_freq[word] > min_freq]
min_freq*num_of_docs, len(word_list)

(83.48, 634)

In [56]:
word_list

['him',
 'getting',
 'classmates',
 'used',
 'country',
 'works',
 'blood',
 'more',
 'light',
 'himself',
 'who',
 'top',
 'path',
 'say',
 'by',
 'police',
 'becoming',
 'organization',
 'teenage',
 'begins',
 'mysterious',
 'looks',
 'missing',
 'very',
 'adventures',
 'body',
 'made',
 'behind',
 'due',
 'meet',
 'under',
 'rest',
 'just',
 'name',
 'three',
 'sister',
 'end',
 'does',
 'goes',
 'young',
 'appears',
 'secrets',
 'look',
 'sets',
 'falls',
 'big',
 'on',
 'left',
 'a',
 'close',
 'rescue',
 'race',
 'whom',
 'already',
 'survive',
 'former',
 'naturally',
 'challenge',
 '–',
 'decide',
 'has',
 'pretty',
 'side',
 'god',
 'save',
 'allies',
 'best',
 'deep',
 'were',
 'back',
 'original',
 'and',
 'younger',
 'attention',
 'anyone',
 'gets',
 'meets',
 'no',
 'turned',
 'dreams',
 'ship',
 'every',
 'mind',
 'set',
 'together',
 'actually',
 'course',
 'army',
 'girls',
 'being',
 'down',
 'should',
 'ca',
 'forces',
 'humanity',
 'force',
 'two',
 'stay',
 'cute',


In [60]:
filtered_synopsis = synopsis.map(lambda syn: [word for word in syn if word in word_list])

In [61]:
filtered_synopsis.head()

0    [at, last, has, up, to, the, where, he, and, t...
1    [in, a, world, where, the, war, continues, wel...
2    [ca, n't, bring, himself, to, that, his, with,...
3    [about, to, off, her, to, her, her, before, sh...
4    [things, continue, to, change, little, by, lit...
Name: Synopsis, dtype: object

In [63]:
synopsis.map(len).describe()

count    4174.000000
mean      134.417585
std        54.606106
min         4.000000
25%        98.000000
50%       130.000000
75%       158.000000
max       425.000000
Name: Synopsis, dtype: float64

In [62]:
filtered_synopsis.map(len).describe()

count    4174.000000
mean       91.074030
std        37.116884
min         3.000000
25%        67.000000
50%        88.000000
75%       108.000000
max       291.000000
Name: Synopsis, dtype: float64

## Calculating frequencies

In [66]:
pair_doc_count = defaultdict(int)

for syn in fitered_synopsis:
    updated = defaultdict(bool)
    for w1 in syn:
        for w2 in syn:
            if w1 < w2 and not updated[(w1,w2)]:
                pair_doc_count[(w1,w2)] += 1
                updated[(w1,w2)]= True

In [68]:
pair_doc_freq = {pair: count/num_of_docs for pair, count in pair_doc_count.items()}

In [71]:
len(pair_doc_count), len(pair_doc_freq)

(198222, 198222)

In [74]:
pairs = list(pair_doc_count.keys())

In [95]:
df_words = pd.DataFrame()

df_words['A'] = pd.Series(w1 for w1, w2 in pairs)
df_words['B'] = pd.Series(w2 for w1, w2 in pairs)
df_words['P(A and B)/(P(A)*P(B))'] = pd.Series(pair_doc_freq[(w1,w2)]/(doc_freq[w1]*doc_freq[w2]) for w1, w2 in pairs)
df_words['P(A)'] = pd.Series(doc_freq[w1] for w1, w2 in pairs)
df_words['P(B)'] = pd.Series(doc_freq[w2] for w1, w2 in pairs)
df_words['P(A|B)'] = pd.Series(pair_doc_freq[(w1,w2)]/doc_freq[w2] for w1, w2 in pairs)
df_words['P(B|A)'] = pd.Series(pair_doc_freq[(w1,w2)]/doc_freq[w1] for w1, w2 in pairs)
df_words['P(A and B)'] = pd.Series(pair_doc_freq[(w1,w2)] for w1, w2 in pairs)

In [96]:
df_words.head(5)

Unnamed: 0,A,B,P(A and B)/(P(A)*P(B)),P(A),P(B),P(A|B),P(B|A),P(A and B)
0,everyone,fight,0.787666,0.066124,0.068999,0.052083,0.054348,0.003594
1,fight,visit,0.945199,0.068999,0.022041,0.065217,0.020833,0.001437
2,also,through,1.070941,0.1931,0.084571,0.206799,0.090571,0.017489
3,ones,sent,1.302747,0.021322,0.025874,0.027778,0.033708,0.000719
4,for,help,1.046418,0.599185,0.134883,0.626998,0.141144,0.084571


In [100]:
df_words.sort_values(by = 'P(A and B)/(P(A)*P(B))', ascending= False)

Unnamed: 0,A,B,P(A and B)/(P(A)*P(B)),P(A),P(B),P(A|B),P(B|A),P(A and B)
170926,short,stories,13.565500,0.028749,0.023958,0.390000,0.325000,0.009344
47160,giant,robot,10.131068,0.033541,0.024677,0.339806,0.250000,0.008385
136216,blood,vampire,9.392439,0.026354,0.024197,0.247525,0.227273,0.005989
139108,'',``,9.269346,0.082894,0.107571,0.768374,0.997110,0.082655
22010,matters,worse,9.264027,0.021083,0.030666,0.195312,0.284091,0.005989
99729,crew,ship,8.663945,0.035697,0.023239,0.309278,0.201342,0.007187
18814,hero,heroes,8.408342,0.023239,0.020843,0.195402,0.175258,0.004073
190600,society,soul,7.658716,0.022760,0.026114,0.174312,0.200000,0.004552
110306,hidden,village,7.637327,0.021083,0.028270,0.161017,0.215909,0.004552
119226,break,summer,7.568907,0.022520,0.021083,0.170455,0.159574,0.003594
