# Importing Libraries

In [None]:
import numpy as np 
import pandas as pd
import re
import os
import spacy
import matplotlib.pyplot as plt

In [None]:
!pip install pytextrank
import pytextrank

import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# pos_tag loads a pre-trained tagger model at call time; without this
# download the first call raises LookupError on a fresh environment.
# NOTE(review): recent NLTK releases may expect the resource names
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' instead - confirm against
# the installed NLTK version.
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag

nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('wordnet')
from nltk.corpus import wordnet

from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, used by lemmatize() further down
wordnet_lemmatizer = WordNetLemmatizer()

# Importing the Dataset

In [None]:
# Read the tab-separated file 'reviews.txt' into a DataFrame
data = pd.read_csv('reviews.txt', sep='\t')
# Preview the first rows to check that the file was parsed as expected
data.head()

In [None]:
# Dropping the unwanted columns
#
# `mydata` is the working DataFrame that every later cell reads from and
# writes to, so it has to be defined here. DataFrame.drop returns a new frame,
# leaving the freshly loaded `data` untouched. errors='ignore' makes the drop
# a no-op when the file has no 'Unnamed: 0' column instead of raising KeyError.
mydata = data.drop(columns='Unnamed: 0', errors='ignore')
mydata.head()


# Data Pre-Processing

In [None]:
# Cleaning -> Tokenization -> POS tagging -> Stopwords removal -> Lemmatization

# Pattern for any run of one or more characters that are not ASCII letters
_NON_LETTERS = re.compile('[^A-Za-z]+')


def clean(text):
    """Return *text* with each run of non-alphabetic characters replaced by one space.

    Digits, punctuation, whitespace and any other character outside A-Z / a-z
    are collapsed, run by run, into a single space; letters are left as they are.
    """
    return _NON_LETTERS.sub(' ', text)

# Cleaning the text in the review column: run clean() on every value of
# 'review' and keep the result in a new 'Cleaned Reviews' column
mydata['Cleaned Reviews'] = mydata['review'].apply(clean)

# Maps the first letter of a tag returned by pos_tag to the matching WordNet
# part-of-speech constant; tags starting with any other letter have no entry.
pos_dict = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}

# English stopwords, loaded once. Rebuilding this set for every token (as a
# per-word `set(stopwords.words('english'))` would) re-reads the whole list
# each time and dominates the running time on large datasets.
_STOP_WORDS = frozenset(stopwords.words('english'))


def token_stop_pos(text):
    """Tokenize *text*, POS-tag it, and drop English stopwords.

    Tagging is done on the full token sequence before stopwords are removed.

    Returns:
        A list of ``(word, wordnet_pos)`` tuples in the original token order.
        ``wordnet_pos`` is the ``pos_dict`` entry for the first letter of the
        token's tag, or ``None`` when that letter is not in ``pos_dict``.
        Words keep their original casing; only the stopword test is
        case-insensitive.
    """
    tagged = pos_tag(word_tokenize(text))
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in tagged
        if word.lower() not in _STOP_WORDS
    ]

# Run token_stop_pos() on every cleaned review and store the resulting list
# of (word, pos) tuples in a new 'POS tagged' column
mydata['POS tagged'] = mydata['Cleaned Reviews'].apply(token_stop_pos)

def lemmatize(pos_data):
    """Join the lemmas of ``(word, pos)`` pairs into one space-separated string.

    Args:
        pos_data: Iterable of ``(word, pos)`` tuples. ``pos`` is passed to the
            lemmatizer as its ``pos`` argument; a falsy ``pos`` (e.g. ``None``)
            means the word is kept unchanged.

    Returns:
        The lemmas separated by single spaces, with no leading or trailing
        whitespace. An empty input yields an empty string.
    """
    lemmas = [
        # Words without a usable POS tag are passed through as-is
        wordnet_lemmatizer.lemmatize(word, pos=pos) if pos else word
        for word, pos in pos_data
    ]
    # join avoids the stray leading spaces (and repeated string copying) that
    # come from growing a " "-seeded accumulator with " " + lemma per word
    return " ".join(lemmas)
    
# Run lemmatize() on every list of (word, pos) tuples and store the resulting
# string in a new 'Lemma' column, then preview the frame
mydata['Lemma'] = mydata['POS tagged'].apply(lemmatize)
mydata.head()


# Extracting Keywords and Phrases

In [None]:
import spacy
import pytextrank
nlp = spacy.load('en_core_web_sm')

# Append the TextRank component as the last pipe of the spaCy pipeline.
# pytextrank is installed unpinned above, so both API generations are handled:
#   * pytextrank 2.x (spaCy 2.x) exposes a TextRank class whose
#     PipelineComponent callable is passed to add_pipe;
#   * pytextrank 3.x (spaCy 3.x) has no TextRank class - importing the package
#     registers a 'textrank' factory that add_pipe refers to by name.
if hasattr(pytextrank, 'TextRank'):
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
else:
    nlp.add_pipe('textrank', last=True)

In [None]:
# One entry per row of mydata['Lemma'], in row order
extracted = []

for text in mydata['Lemma']:
    # Run the spaCy pipeline on the lemmatized review; note that `text` is
    # rebound from the input string to the processed document
    text = nlp(text)
    # `_.phrases` is a custom extension attribute on the document -
    # presumably filled in by the 'textrank' pipe added to `nlp`
    t = text._.phrases
    extracted.append(t)
    
mydata['Pytextrank_keyword'] = extracted 

# Flatten each row's phrase collection into one comma-separated string.
# NOTE(review): str() is applied to each whole phrase object, so the stored
# text is whatever that object's string form is, not necessarily just the
# phrase text - confirm this is the intended output.
mydata['Pytextrank_keyword'] = mydata['Pytextrank_keyword'].agg(lambda x: ','.join(map(str, x)))

In [None]:
# Keep only the original review, its sentiment label and the extracted
# keyword string; the intermediate preprocessing columns are left out
New_Data = mydata[['review','sentiment','Pytextrank_keyword']]

In [None]:
# Preview the first rows of the reduced frame
New_Data.head()

# Saving the CSV File