# Project 2B

In [1]:
!pip install contractions



In [2]:
#Importing required libraries

import re
import contractions
import nltk
import numpy as np
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab'; without it
# word_tokenize raises LookupError. 'punkt' is kept for older releases.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Sample movie review used as the demo input for the PreProcess pipeline below.
txt = "This movie made it into one of my top 10 most awful movies. Horrible. There wasn't a continuous minute where there wasn't a fight with one monster or another. There was no chance for any character development, they were too busy running from one sword fight to another. I had no emotional attachment ( except to the big bad machine ## that wanted to destroy them)"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Data Cleaning

In [3]:
class PreProcess():
    '''
    Text-cleaning pipeline for a single document.

    Intended call order (each step stores its result on the instance):
    regexp() -> token() -> remove_stopwords() -> stemmingOrLemmatization().

    Attributes
    ----------
    text         : the raw input string
    expanded_txt : text with contractions expanded
    spec         : expanded_txt with special characters replaced by spaces
    rg           : cleaned text produced by regexp() (None until it runs)
    tokenized    : tokens produced by token()
    stop_word    : tokens left after remove_stopwords()
    tags         : initialised to an empty list; not used by any method here
    '''

    def __init__(self, text):
        self.text = text
        self.tokenized = []
        self.stop_word = []
        self.tags = []
        self.rg = None
        self.expanded_txt = self.remcontra(self.text)
        self.spec = self.remSpecChar(self.expanded_txt)

    def remcontra(self, text: str):
        '''
        Expand the contractions in the given text (e.g. "doesn't" -> "does not").

        Input arguments : text (string that may contain contractions)
        ---------------
        Returns : the text with every whitespace-separated word expanded and
        -------   re-joined with single spaces ('' for empty input)
        '''
        # Join once, after all words are expanded. A generator also means an
        # empty text yields '' instead of hitting an unbound local variable.
        return ' '.join(contractions.fix(word) for word in text.split())

    def remSpecChar(self, text: str):
        '''
        Replace every run of characters other than letters, digits and '@'
        (whitespace included) with a single space.

        Input arguments : text (string to clean)
        ---------------
        Returns : the cleaned string
        -------
        '''
        return re.sub('[^A-Za-z0-9@]+', ' ', text)

    def regexp(self):
        '''
        Replace e-mail addresses with the word 'email', strip special
        characters and remove digits.

        Input arguments : none (reads self.expanded_txt)
        ---------------
        Returns : the simplified text, also stored in self.rg
        -------
        '''
        # E-mail addresses have to be matched while their dots are still
        # present; once special characters are stripped the pattern can no
        # longer match, so substitute first and clean afterwards.
        with_emails = re.sub(r'[\w\-\.]+@([\w-]+\.)+[\w-]{2,4}', 'email',
                             self.expanded_txt)
        self.rg = re.sub(r'\d+', '', self.remSpecChar(with_emails))
        return self.rg

    def token(self, tokenize_on=None):
        '''
        Split the cleaned text (self.rg) into word tokens.

        Input arguments : tokenize_on (not used; accepted so existing calls
        ---------------                that pass a value keep working)
        Returns : list of tokens, also stored in self.tokenized
        -------
        '''
        # Make sure the cleaned text exists even if regexp() was not called.
        if getattr(self, 'rg', None) is None:
            self.regexp()
        self.tokenized = nltk.word_tokenize(self.rg)
        return self.tokenized

    def remove_stopwords(self, ignore_case=False):
        '''
        Drop English stop words (is, the, are, ...) from self.tokenized.

        Input arguments : ignore_case (when True, tokens are lower-cased
        ---------------                before the lookup so 'This' or 'There'
                                       are dropped as well; the default False
                                       keeps the exact-match behaviour)
        Returns : list of tokens without stop words, also stored in
        -------   self.stop_word
        '''
        stop_words = set(stopwords.words('english'))
        if ignore_case:
            kept = [w for w in self.tokenized if w.lower() not in stop_words]
        else:
            kept = [w for w in self.tokenized if w not in stop_words]
        # Rebuild the list on every call so re-running the step does not
        # append the same tokens a second time.
        self.stop_word = kept
        return self.stop_word

    def stemmingOrLemmatization(self, method):
        """
        Run stemming when method == 'stem'; any other value runs
        lemmatization. The result is stored in self.out and returned.
        """
        if method == 'stem':
            self.out = self.stemming()
        else:
            self.out = self.lemmatization()
        return self.out

    # Stemming
    def stemming(self):
        """
        Reduce every token in self.stop_word to its Porter stem and return
        the stems as a list.
        """
        stemmer = PorterStemmer()
        return [stemmer.stem(word) for word in self.stop_word]

    # Lemmatization
    def lemmatization(self):
        """
        Lemmatize every token in self.stop_word with WordNet (the same input
        that stemming() uses, not its output) and return the lemmas as a list.
        """
        lem = WordNetLemmatizer()
        return [lem.lemmatize(word) for word in self.stop_word]

    def ngram(self, tx, n=5):
        '''
        Print every word n-gram of the given text, one tuple per line.

        Input arguments : tx (text to tokenize), n (n-gram length, default 5)
        ---------------
        Returns : None (the n-grams are printed)
        -------
        '''
        for grams in ngrams(sequence=nltk.word_tokenize(tx), n=n):
            print(grams)

      
 

Cleaned Data

In [4]:
process = PreProcess(txt)
process.regexp()
# regexp() returns the cleaned text, which this cell displays as its output.

'This movie made it into one of my top  most awful movies Horrible There was not a continuous minute where there was not a fight with one monster or another There was no chance for any character development they were too busy running from one sword fight to another I had no emotional attachment except to the big bad machine that wanted to destroy them '

Tokenizing

In [5]:
# token() tokenizes the cleaned text stored by regexp(); the argument passed
# here is not used by the method.
process.token(txt)

['This',
 'movie',
 'made',
 'it',
 'into',
 'one',
 'of',
 'my',
 'top',
 'most',
 'awful',
 'movies',
 'Horrible',
 'There',
 'was',
 'not',
 'a',
 'continuous',
 'minute',
 'where',
 'there',
 'was',
 'not',
 'a',
 'fight',
 'with',
 'one',
 'monster',
 'or',
 'another',
 'There',
 'was',
 'no',
 'chance',
 'for',
 'any',
 'character',
 'development',
 'they',
 'were',
 'too',
 'busy',
 'running',
 'from',
 'one',
 'sword',
 'fight',
 'to',
 'another',
 'I',
 'had',
 'no',
 'emotional',
 'attachment',
 'except',
 'to',
 'the',
 'big',
 'bad',
 'machine',
 'that',
 'wanted',
 'to',
 'destroy',
 'them']

Removing stopwords

In [6]:
# Filters the tokens produced by token() against NLTK's English stop-word list.
process.remove_stopwords()

['This',
 'movie',
 'made',
 'one',
 'top',
 'awful',
 'movies',
 'Horrible',
 'There',
 'continuous',
 'minute',
 'fight',
 'one',
 'monster',
 'another',
 'There',
 'chance',
 'character',
 'development',
 'busy',
 'running',
 'one',
 'sword',
 'fight',
 'another',
 'I',
 'emotional',
 'attachment',
 'except',
 'big',
 'bad',
 'machine',
 'wanted',
 'destroy']

Stemming

In [7]:
# 'stem' selects Porter stemming of the stop-word-filtered tokens.
process.stemmingOrLemmatization('stem')

['thi',
 'movi',
 'made',
 'one',
 'top',
 'aw',
 'movi',
 'horribl',
 'there',
 'continu',
 'minut',
 'fight',
 'one',
 'monster',
 'anoth',
 'there',
 'chanc',
 'charact',
 'develop',
 'busi',
 'run',
 'one',
 'sword',
 'fight',
 'anoth',
 'i',
 'emot',
 'attach',
 'except',
 'big',
 'bad',
 'machin',
 'want',
 'destroy']

Lemmatization

In [8]:
# Any value other than 'stem' (here 'lemm') selects WordNet lemmatization.
process.stemmingOrLemmatization('lemm')

['This',
 'movie',
 'made',
 'one',
 'top',
 'awful',
 'movie',
 'Horrible',
 'There',
 'continuous',
 'minute',
 'fight',
 'one',
 'monster',
 'another',
 'There',
 'chance',
 'character',
 'development',
 'busy',
 'running',
 'one',
 'sword',
 'fight',
 'another',
 'I',
 'emotional',
 'attachment',
 'except',
 'big',
 'bad',
 'machine',
 'wanted',
 'destroy']

N-grams

In [9]:
# Prints word 5-grams of the raw review text (txt), not of the cleaned text.
process.ngram(txt)

('This', 'movie', 'made', 'it', 'into')
('movie', 'made', 'it', 'into', 'one')
('made', 'it', 'into', 'one', 'of')
('it', 'into', 'one', 'of', 'my')
('into', 'one', 'of', 'my', 'top')
('one', 'of', 'my', 'top', '10')
('of', 'my', 'top', '10', 'most')
('my', 'top', '10', 'most', 'awful')
('top', '10', 'most', 'awful', 'movies')
('10', 'most', 'awful', 'movies', '.')
('most', 'awful', 'movies', '.', 'Horrible')
('awful', 'movies', '.', 'Horrible', '.')
('movies', '.', 'Horrible', '.', 'There')
('.', 'Horrible', '.', 'There', 'was')
('Horrible', '.', 'There', 'was', "n't")
('.', 'There', 'was', "n't", 'a')
('There', 'was', "n't", 'a', 'continuous')
('was', "n't", 'a', 'continuous', 'minute')
("n't", 'a', 'continuous', 'minute', 'where')
('a', 'continuous', 'minute', 'where', 'there')
('continuous', 'minute', 'where', 'there', 'was')
('minute', 'where', 'there', 'was', "n't")
('where', 'there', 'was', "n't", 'a')
('there', 'was', "n't", 'a', 'fight')
('was', "n't", 'a', 'fight', 'with')
("

TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on a one-document corpus: the cleaned review text.
# NOTE(review): with a single document every term gets the same IDF, so the
# scores only reflect normalised term counts; add more documents for the IDF
# part to be informative.
data = [ process.regexp() ]
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(data)

In [11]:
# Show the term -> column-index mapping learned by the vectorizer, then the
# TF-IDF scores as a dense array.
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')
print('\ntf-idf values in matrix form:', result.toarray(), sep='\n')


Word indexes:
{'this': 41, 'movie': 26, 'made': 22, 'it': 20, 'into': 19, 'one': 32, 'of': 31, 'my': 28, 'top': 44, 'most': 25, 'awful': 3, 'movies': 27, 'horrible': 18, 'there': 39, 'was': 46, 'not': 30, 'continuous': 9, 'minute': 23, 'where': 48, 'fight': 14, 'with': 49, 'monster': 24, 'or': 33, 'another': 0, 'no': 29, 'chance': 7, 'for': 15, 'any': 1, 'character': 8, 'development': 11, 'they': 40, 'were': 47, 'too': 43, 'busy': 6, 'running': 34, 'from': 16, 'sword': 35, 'to': 42, 'had': 17, 'emotional': 12, 'attachment': 2, 'except': 13, 'the': 37, 'big': 5, 'bad': 4, 'machine': 21, 'that': 36, 'wanted': 45, 'destroy': 10, 'them': 38}

tf-idf values in matrix form:
[[0.20628425 0.10314212 0.10314212 0.10314212 0.10314212 0.10314212
  0.10314212 0.10314212 0.10314212 0.10314212 0.10314212 0.10314212
  0.10314212 0.10314212 0.20628425 0.10314212 0.10314212 0.10314212
  0.10314212 0.10314212 0.10314212 0.10314212 0.10314212 0.10314212
  0.10314212 0.10314212 0.10314212 0.10314212 0.10