In [2]:
import nltk
nltk.download("popular")

from google.colab import files
uploaded = files.upload()

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

Saving reviews.csv to reviews.csv


In [3]:
import pandas as pd

df = pd.read_csv('reviews.csv')

df
# df['Review']

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,
...,...,...,...,...,...
61589,2022-01-01 03:01:29,Even though it was communicated that lyrics fe...,1,6,
61590,2022-01-01 02:13:40,"Use to be sooo good back when I had it, and wh...",1,0,
61591,2022-01-01 01:02:29,This app would be good if not for it taking ov...,2,10,
61592,2022-01-01 00:49:23,The app is good hard to navigate and won't jus...,2,1,


In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [5]:
def tokenize(column):
    """Tokenizes a Pandas dataframe column and returns a list of tokens.

    Args:
        column: Pandas dataframe column (i.e. df['text']).

    Returns:
        tokens (list): Tokenized list, i.e. [Donald, Trump, tweets]

    """

    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]

# Tokenizing

In [7]:
df['tokenized'] = df.apply(lambda x: tokenize(x['Review']), axis=1)
df[['Review', 'tokenized']].head()

Unnamed: 0,Review,tokenized
0,"Great music service, the audio is high quality...","[Great, music, service, the, audio, is, high, ..."
1,Please ignore previous negative rating. This a...,"[Please, ignore, previous, negative, rating, T..."
2,"This pop-up ""Get the best Spotify experience o...","[This, Get, the, best, Spotify, experience, on..."
3,Really buggy and terrible to use as of recently,"[Really, buggy, and, terrible, to, use, as, of..."
4,Dear Spotify why do I get songs that I didn't ...,"[Dear, Spotify, why, do, I, get, songs, that, ..."


# Remove Stopwords

In [9]:
def remove_stopwords(tokenized_column):
    """Return a list of tokens with English stopwords removed.

    Args:
        column: Pandas dataframe column of tokenized data from tokenize()

    Returns:
        tokens (list): Tokenized list with stopwords removed.

    """
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if not word in stops]

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
df['stopwords_removed'] = df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
df[['Review', 'stopwords_removed']].head()

Unnamed: 0,Review,stopwords_removed
0,"Great music service, the audio is high quality...","[Great, music, service, audio, high, quality, ..."
1,Please ignore previous negative rating. This a...,"[Please, ignore, previous, negative, rating, T..."
2,"This pop-up ""Get the best Spotify experience o...","[This, Get, best, Spotify, experience, Android..."
3,Really buggy and terrible to use as of recently,"[Really, buggy, terrible, use, recently]"
4,Dear Spotify why do I get songs that I didn't ...,"[Dear, Spotify, I, get, songs, I, put, playlis..."




# Stemming

In [11]:
def apply_stemming(tokenized_column):
    """Return a list of tokens with Porter stemming applied.

    Args:
        column: Pandas dataframe column of tokenized data with stopwords removed.

    Returns:
        tokens (list): Tokenized list with words Porter stemmed.

    """

    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tokenized_column]

In [None]:
df['porter_stemmed'] = df.apply(lambda x: apply_stemming(x['stopwords_removed']), axis=1)
# df[['Review', 'porter_stemmed']].head()

df[['Review', 'porter_stemmed']].tail()

# Testing

In [None]:
df[['Review', 'porter_stemmed']]

# Building Co-occurance Matrix

In [44]:
vocab_len = 0
all_words = []

for index , row in df.iterrows():
  vocab_len += len(row["porter_stemmed"])

  for each in row["porter_stemmed"]:
    all_words.append(each)


all_words = set(all_words)
print(vocab_len) # 1091303

1091303


# Mutual Info of pair words

In [46]:
co_occurence = {}

for index, row in df.iterrows():
  for x in row["porter_stemmed"]:
    for y in row["porter_stemmed"]:
      if x == y:
        if x not in co_occurence.keys():
          co_occurence[x] = 1

        else:
          co_occurence[x] = co_occurence[x] + 1

        continue

      # print(x , y)

      key = (x , y)
      key2 = (y , x)

      if key not in co_occurence.keys():
        co_occurence[key] = 1
        co_occurence[key2] = 1

      else:
        co_occurence[key] = co_occurence[key] + 1
        co_occurence[key2] = co_occurence[key2] + 1

  # if index % 100 == 0 :
  #   print(key , co_occurence[key])
  #   print(key2 , co_occurence[key2])


# print(co_occurence)
  # if index < 10:
    # print(row["Review"], row["porter_stemmed"])

# Mutual Information

In [None]:
import math

mutual_info = {}
vocabulary_length = 1091303

cnt = 0

for x in all_words:
  cnt += 1

  for y in all_words:
    if (x , y) in co_occurence.keys():
      mutual_info[(x , y)] = math.log((vocabulary_length * co_occurence[(x,y)]) / (co_occurence[x] * co_occurence[y]) , 2)

      if cnt % 100 == 0:
        print((x , y) , mutual_info[(x , y)])

# finding sintagmatic relations to 'Fix' word

In [68]:
for each in mutual_info.keys():
  # if 'fix' in each and mutual_info[each] > 9.2:
  #     print(each , mutual_info[each])

  if 'fix' in each and mutual_info[each] < 1:
    print(each , mutual_info[each])

('tast', 'fix') -0.021881420628885068
('beauti', 'fix') 0.8023473199931154
('misinform', 'fix') 0.9590728236805686
('fix', 'tast') -0.021881420628885068
('fix', 'beauti') 0.8023473199931154
('fix', 'misinform') 0.9590728236805686


# finding sintagmatic relations to 'Like' word

In [70]:
for each in mutual_info.keys():
  if 'like' in each and mutual_info[each] > 9.2:
    print(each , mutual_info[each])

('jayeng', 'like') 9.996502283682894
('dedo', 'like') 9.996502283682894
('atol', 'like') 9.318430378570257
('likeit', 'like') 9.318430378570257
('stikl', 'like') 9.8038572057405
('panicland', 'like') 9.58146478440405
('betti', 'like') 9.8038572057405
('artst', 'like') 9.318430378570257
('misto', 'like') 9.318430378570257
('malala', 'like') 9.58146478440405
('clike', 'like') 9.8038572057405
('ldr', 'like') 9.318430378570257
('upstair', 'like') 9.58146478440405
('prevu', 'like') 9.8038572057405
('like', 'jayeng') 9.996502283682894
('like', 'dedo') 9.996502283682894
('like', 'atol') 9.318430378570257
('like', 'likeit') 9.318430378570257
('like', 'stikl') 9.8038572057405
('like', 'panicland') 9.58146478440405
('like', 'betti') 9.8038572057405
('like', 'artst') 9.318430378570257
('like', 'misto') 9.318430378570257
('like', 'malala') 9.58146478440405
('like', 'clike') 9.8038572057405
('like', 'ldr') 9.318430378570257
('like', 'upstair') 9.58146478440405
('like', 'prevu') 9.8038572057405
('li

In [42]:
print(co_occurence['song' , 'fix'])
print(co_occurence['user' , 'fix'])
# print(co_occurence['primum' , 'fix'])
print(co_occurence['playlist' , 'fix'])
print(co_occurence['pretti' , 'fix'])
print(co_occurence['look' , 'fix'])
print('----------------')

print(len(co_occurence))

9288
580
2682
184
364
----------------
2952717
