<a href="https://colab.research.google.com/github/ShotaroBaba/NLP_Practice/blob/NLP_smaller_dataset/NLPIMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# First, download the text dataset.
# The IMDB movie review dataset is used for this experiment.
import os
import gensim
import requests 
from gensim.parsing.preprocessing import stem
from gensim.parsing.preprocessing import strip_punctuation

from gensim.corpora import Dictionary
# URL of the IMDB review archive and the local file name it is stored under.
urlToWiki = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
savedFileName = 'aclImdb_v1.tar.gz'
# Download only when the archive is not already present.
if not os.path.isfile(savedFileName):
  # Write to a temporary name first so that an interrupted or failed download
  # never leaves a broken file that the isfile() check would accept next time.
  partialFileName = savedFileName + '.part'
  with requests.get(urlToWiki, stream=True, timeout=60) as response:
    # Fail loudly on HTTP errors instead of saving an error page as the archive.
    response.raise_for_status()
    with open(partialFileName, 'wb') as output:
      # Stream in 1 MiB chunks rather than holding the whole archive in memory.
      for chunk in response.iter_content(chunk_size=1 << 20):
        output.write(chunk)
  os.replace(partialFileName, savedFileName)

In [0]:
import tarfile

# Extract the tar.gz file into the current working directory.
# The context manager closes the archive handle once extraction is done.
# NOTE(review): extractall() trusts the member paths stored in the archive, and
# the archive is fetched over plain HTTP; consider validating members (or an
# extraction filter, where the installed Python supports one).
with tarfile.open(savedFileName, "r:gz") as archive:
  archive.extractall()

In [0]:
# Collect the path of every ".txt" file found under the negative and the
# positive test directories (sub-directories are visited bottom-up).
neg_test = []
for root, _dirs, files in os.walk("aclImdb/test/neg/", topdown=False):
  neg_test.extend(os.path.join(root, name) for name in files if name.endswith(".txt"))

pos_test = []
for root, _dirs, files in os.walk("aclImdb/test/pos/", topdown=False):
  pos_test.extend(os.path.join(root, name) for name in files if name.endswith(".txt"))

In [27]:
# Preview the first ten entries of neg_test (file paths at this point in the notebook).
print(neg_test[:10])

['aclImdb/test/neg/4332_2.txt', 'aclImdb/test/neg/3195_2.txt', 'aclImdb/test/neg/7053_1.txt', 'aclImdb/test/neg/9996_2.txt', 'aclImdb/test/neg/3744_4.txt', 'aclImdb/test/neg/1708_1.txt', 'aclImdb/test/neg/7690_3.txt', 'aclImdb/test/neg/11833_4.txt', 'aclImdb/test/neg/11344_1.txt', 'aclImdb/test/neg/5580_1.txt']


In [0]:
# Collect the path of every ".txt" file found under the negative and the
# positive training directories (sub-directories are visited bottom-up).
neg_train = []
for root, _dirs, files in os.walk("aclImdb/train/neg/", topdown=False):
  neg_train.extend(os.path.join(root, name) for name in files if name.endswith(".txt"))

pos_train = []
for root, _dirs, files in os.walk("aclImdb/train/pos/", topdown=False):
  pos_train.extend(os.path.join(root, name) for name in files if name.endswith(".txt"))

In [0]:
# Return the text stored at a path.
# If the text is too short, it is omitted (reported as False).
def fetch_text(path, min_words=50, encoding="utf-8"):
  """Read a text file and return its content, or False when it is too short.

  Args:
    path: Path of the text file to read.
    min_words: Minimum number of whitespace-separated words the text must
      contain to be kept. Defaults to 50, the previously hard-coded threshold.
    encoding: Encoding used to decode the file. It is given explicitly so the
      result does not depend on the platform's default encoding
      (assumes the dataset files are UTF-8 — TODO confirm).

  Returns:
    The file content as a string, or False when it has fewer than
    ``min_words`` words. False is falsy, so callers can discard short texts
    with ``filter(None, ...)``.
  """
  with open(path, encoding=encoding) as f:
    text = f.read()
  # The file is already closed here; only the length check remains.
  if len(text.split()) < min_words:
    return False
  return text

In [0]:
# Next, read the files: each list of paths is replaced by the list of texts
# loaded from those paths. Entries for which fetch_text returns a falsy value
# (texts that are too short) are dropped.
# NOTE: the variable names are reused, so these lists hold file paths before
# this cell runs and review texts afterwards.
neg_test = [text for text in map(fetch_text, neg_test) if text]
pos_test = [text for text in map(fetch_text, pos_test) if text]
neg_train = [text for text in map(fetch_text, neg_train) if text]
pos_train = [text for text in map(fetch_text, pos_train) if text]

In [0]:
# Merge the four lists into a single one, keeping the order
# negative test, positive test, negative train, positive train.

all_text = [*neg_test, *pos_test, *neg_train, *pos_train]

In [0]:
# Load the other libraries and functions needed for pre-processing.
import spacy
# Load the spaCy pipeline registered as 'en', with the parser and NER components disabled.
# NOTE(review): the 'en' shortcut name presumably exists only in older spaCy releases — confirm against the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])
def lemmatize_sentence(text):
  """Return ``text`` with each token replaced by its lemma, joined by single spaces."""
  lemmas = (token.lemma_ for token in nlp(text))
  return " ".join(lemmas)
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_short
from collections import Counter
# Stop-word collection shipped with gensim; used later to filter tokens.
stopwords = gensim.parsing.preprocessing.STOPWORDS

In [0]:
# Pre-process the texts: lower-case and lemmatize each one, then tokenize it
# with gensim's preprocess_string using the filters below, in the listed order.
# Stop words are not removed here.
text_filters = [strip_tags, strip_short, strip_punctuation, strip_multiple_whitespaces]
all_text_processed = []
for review in all_text:
  lemmatized = lemmatize_sentence(review.lower())
  all_text_processed.append(preprocess_string(lemmatized, filters=text_filters))

In [0]:
# Drop single-character tokens, gensim stop words and the "PRON" placeholder
# (presumably what is left of spaCy's pronoun lemma after punctuation stripping — verify).

all_text_processed = [
    [token for token in tokens
     if len(token) > 1 and token not in stopwords and token != "PRON"]
    for tokens in all_text_processed
]

In [0]:
# Display how the first document looks after pre-processing.
print(all_text_processed[0])

['wow', 'watch', 'night', 'mccool', 'yesterday', 'wow', 'major', 'spoilers', 'like', 'summary', 'plot', 'stupid', 'pointless', 'movie', 'tell', 'anybody', 'watch', 'inflict', 'pain', 'glimpse', 'huge', 'chunk', 'plot', 'randy', 'work', 'bar', 'mccool', 'meet', 'woman', 'jewel', 'convince', 'surprisingly', 'sex', 'boyfriend', 'end', 'try', 'rob', 'kill', 'randy', 'cousin', 'detective', 'scene', 'crime', 'fall', 'jewel', 'mascot', 'stupidity', 'use', 'guy', 'want', 'involve', 'dvd', 'player', 'randy', 'hire', 'hit', 'man', 'kill', 'eventually', 'detective', 'kill', 'boyfriend', 'psycho', 'brother', 'hit', 'man', 'jewel', 'seriously', 'leave', 'hardly', 'sex', 'scene', 'nearly', 'pornographic', 'scene', 'liv', 'tyl', 'jewel', 'use', 'hose', 'flaunt', 'sexuality', 'point', 'movie', 'honest', 'think', 'producer', 'director', 'male', 'urge', 'think', 'absolutely', 'uncalled', 'plain', 'stupid', 'watch', 'movie', 'want', 'plot', 'want', 'character', 'care', 'sexy', 'woman', 'flaunt', 'happen'

In [0]:
# Build the gensim Dictionary from the token lists, then convert every
# document into its bag-of-words representation.
dictionary = Dictionary(all_text_processed)
corpus = list(map(dictionary.doc2bow, all_text_processed))

In [0]:
# Train a 5-topic LDA model on the bag-of-words corpus.
# random_state seeds the model's random number generator so that repeated runs
# start from the same initialisation instead of a different one each time.
lda_imdb = gensim.models.LdaMulticore(corpus, id2word=dictionary, num_topics=5, random_state=42)

In [0]:
# As the output shows, the topics are all about movies and drama series.

lda_imdb.show_topics()

[(0,
  '0.007*"play" + 0.006*"good" + 0.006*"film" + 0.005*"movie" + 0.004*"story" + 0.004*"time" + 0.004*"love" + 0.004*"great" + 0.004*"character" + 0.004*"man"'),
 (1,
  '0.038*"film" + 0.005*"like" + 0.005*"movie" + 0.005*"story" + 0.005*"time" + 0.005*"scene" + 0.005*"character" + 0.004*"good" + 0.004*"plot" + 0.003*"use"'),
 (2,
  '0.058*"movie" + 0.016*"like" + 0.015*"watch" + 0.015*"good" + 0.012*"film" + 0.012*"think" + 0.011*"time" + 0.008*"bad" + 0.007*"great" + 0.007*"people"'),
 (3,
  '0.022*"film" + 0.007*"life" + 0.006*"time" + 0.006*"story" + 0.005*"character" + 0.005*"movie" + 0.005*"man" + 0.005*"good" + 0.005*"like" + 0.004*"work"'),
 (4,
  '0.019*"film" + 0.016*"good" + 0.015*"movie" + 0.011*"like" + 0.007*"great" + 0.007*"character" + 0.007*"scene" + 0.006*"look" + 0.006*"story" + 0.006*"love"')]

In [0]:
# Count the number of occurrences of each word in a corpus.
def count_words_in_corpus(corpus, id2word=None):
  """Sum the per-document token counts of a bag-of-words corpus.

  Args:
    corpus: Iterable of documents, each an iterable of (token_id, count) pairs.
    id2word: Mapping from token id to word. When omitted, the module-level
      ``dictionary`` is used, which is what the function relied on before.

  Returns:
    A Counter mapping each word to its total count over all documents.
  """
  if id2word is None:
    # Backward-compatible default: fall back to the notebook's global dictionary.
    id2word = dictionary
  count_dict = Counter()
  for word_count in corpus:
    for key, value in word_count:
      # Add directly to the counter instead of building a one-entry dict per pair.
      count_dict[id2word[key]] += value

  return count_dict

In [0]:
# Total number of occurrences of every word across the whole corpus.
count_of_words = count_words_in_corpus(corpus)

In [0]:
# (word, count) pairs ordered from the most to the least frequent word.
sorted_count = count_of_words.most_common()

In [0]:
# Show the 20 most frequent (word, count) pairs, one per line.
print(*sorted_count[:20], sep="\n")

('movie', 101922)
('film', 94644)
('like', 43949)
('good', 40883)
('time', 31142)
('character', 27966)
('watch', 27346)
('bad', 26082)
('story', 25046)
('think', 23103)
('scene', 21149)
('great', 19805)
('look', 19542)
('know', 18990)
('people', 18302)
('way', 17120)
('love', 17003)
('play', 16816)
('come', 16369)
('thing', 16334)


In [0]:
def create_corpus(text_to_be_converted):
  """Pre-process raw texts and count how often each remaining token occurs.

  Each text is lower-cased, lemmatized with ``lemmatize_sentence`` and tokenized
  with gensim's ``preprocess_string``. Tokens of length one, gensim stop words
  and the "PRON" placeholder are then discarded.

  Note that, despite its name, the function returns word counts, not a corpus.

  Args:
    text_to_be_converted: Iterable of raw text strings.

  Returns:
    A Counter mapping each kept token to its number of occurrences over all
    the given texts.
  """
  count_dict = Counter()
  # Handle one text at a time: no full intermediate token lists are kept in
  # memory, and no local name shadows the notebook-level all_text_processed.
  for text in text_to_be_converted:
    tokens = preprocess_string(lemmatize_sentence(text.lower()),
                               filters=[strip_tags, strip_short,
                                        strip_punctuation, strip_multiple_whitespaces])
    # Counter.update with an iterable adds one per occurrence.
    count_dict.update(token for token in tokens
                      if len(token) > 1 and token not in stopwords and token != "PRON")
  return count_dict

In [0]:
# Count words separately for each of the two sentiments.
# neg / pos concatenate the test and train lists of the corresponding sentiment.
# NOTE(review): create_corpus expects review texts here. If these lists still
# hold file paths, fragments of the paths are counted instead of review words.
neg = neg_test + neg_train
pos = pos_test + pos_train

pos_processed_count = create_corpus(pos)
neg_processed_count = create_corpus(neg)

In [38]:
 # Show the ten most frequent tokens counted for the positive texts.
 print(pos_processed_count.most_common(10))

[('aclimdb', 25000), ('pos', 25000), ('txt', 25000), ('test', 12500), ('train', 12500), ('10', 9733), ('7819', 2), ('1610', 2), ('8541', 2), ('3498', 2)]


In [39]:
 # Show the ten most frequent tokens counted for the negative texts.
 print(neg_processed_count.most_common(10))

[('aclimdb', 25000), ('neg', 25000), ('txt', 25000), ('test', 12500), ('train', 12500), ('4332', 2), ('3195', 2), ('7053', 2), ('9996', 2), ('3744', 2)]


In [0]:
# References
# Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis. 
# The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).