# Pre processing and cleaning the data

In [81]:
#importing all the necessary library
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import textblob
from textblob import TextBlob, Word
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\44758\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [82]:
#user deifned functions

def avg_word_len (sentence):
    words = sentence.split()
    avg_len = sum(len(word) for word in words)/len(words)
    return avg_len

def extract_ngrams(data, num):
    '''
    Function to generate n-grams from sentences
    '''
    n_grams = TextBlob(data).ngrams(num)
    return [ ' '.join(grams) for grams in n_grams]

In [83]:
train = pd.read_csv('final roberta sentiment.csv')

In [84]:
print(train.shape)
train.head()

(7102, 4)


Unnamed: 0,ID,location,review,sentiment
0,1,Syambhunath,It is at the top of valleys mountain. Best pl...,1
1,2,Syambhunath,This place has a significant importance in Bud...,1
2,3,Syambhunath,Visited this from the other side on a rainy ev...,1
3,4,Syambhunath,A beautiful temple situated in the capital wit...,1
4,5,Syambhunath,"great, beautiful, historic & religious place.....",1


In [85]:
df_train = train.copy()


In [86]:
#feature extraction
#character count

df_train['char_count'] = df_train['review'].str.len()
df_train_sort_charcount = df_train.sort_values(by='char_count', ascending=False)
df_train_sort_charcount[['review', 'char_count']].head()

Unnamed: 0,review,char_count
6403,Will remind fellow Indians of the shacks on Go...,339
4560,It is proud of Nepal. It is the birth place of...,338
384,I recently visited this very culturally rich h...,338
3771,"Amasing location for trekking, mutlie routes a...",338
1836,It is a place which has temples - some of whic...,337


In [87]:
#word count
df_train['word_count'] = df_train['review'].apply(lambda x: len(str(x).split(" ")))
df_train_sort_wordcount = df_train.sort_values(by='word_count', ascending=False)
df_train_sort_wordcount[['review','word_count']].head()

Unnamed: 0,review,word_count
3649,"Calm, peaceful and less touristic park! We hik...",104
3536,I couldn't get an uber up to the top of the mo...,70
6950,It's the most amazing place to hangout in pokh...,68
4343,The birth place of lord bouddha . It is a prid...,67
1691,One of my favourite place to go & chill with f...,67


In [88]:
#special character count
# Number of hashtags in a tweet

df_train['hashtags'] = df_train['review'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df_train_sort_hashtags = df_train.sort_values(by='hashtags', ascending=False)
df_train_sort_hashtags[['review', 'hashtags']].head()

Unnamed: 0,review,hashtags
4871,#Historical palace #peaceful Environment #Beli...,9
3102,It's an outstanding place of full and natural ...,5
772,One of the #WorldHeritages ultimate #HinduShri...,4
2434,#Bhaktapur #Patan #Kathmandu these three place...,4
4725,#World_Peace\n#World_Heritage_Site\n#Birth_Pla...,3


In [89]:
#stopword count
stop_words = stopwords.words('english')

df_train['stopwords'] = df_train['review'].apply(lambda x: len([i for i in x.split() if i in stop_words]))
df_train_sort_stopwords = df_train.sort_values(by='stopwords', ascending=False)
df_train_sort_stopwords[['review', 'stopwords']].head()

Unnamed: 0,review,stopwords
3536,I couldn't get an uber up to the top of the mo...,38
6950,It's the most amazing place to hangout in pokh...,33
485,i have visited here for the first time and all...,33
1915,even for short 10mins trip out it feels good t...,30
454,A superb site and great views over the city. T...,30


In [90]:
#number count
df_train['number_count'] = df_train['review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df_train_sort_number_count = df_train.sort_values(by='number_count', ascending=False)
df_train_sort_number_count[['review, 'number_count']].head()

SyntaxError: invalid syntax (Temp/ipykernel_3296/791664848.py, line 4)

In [91]:
#uppercase word count
df_train['upper_word'] = df_train['review'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
df_train_sort_uppercase = df_train.sort_values(by='upper_word', ascending=False)
df_train_sort_uppercase[['review', 'upper_word']].head()

Unnamed: 0,review,upper_word
3429,PLEASE WAS BEAUTIFUL BUT THERE PEOPLE ARE SO R...,27
3623,(Translated by Google) THE HIGHEST MOUNTAIN IN...,25
1266,NATIONAL PARK LOCATED ON THE WAY TO KATHMANDU ...,24
6282,POKHARA LAKE SIDE FALLS IN ONE OF THE MOST IMP...,24
6225,Purest form of nature is here....one of the go...,16


In [92]:
#average word length
df_train['avg_word_len'] = df_train['review'].apply(lambda x: round(avg_word_len(x),1))
df_train_sort_avg_word_len = df_train.sort_values(by='avg_word_len', ascending=True)
df_train_sort_avg_word_len[['review', 'avg_word_len']].head()

Unnamed: 0,review,avg_word_len
3941,üòç,1.0
1432,H a y a t,1.0
5205,üíõ,1.0
525,üôè,1.0
4726,üíú,1.0


**Extract features using NLP techniques below**


In [93]:
data = df_train['review'][0]
 
print("1-gram: ", extract_ngrams(data, 1))
print("2-gram: ", extract_ngrams(data, 2))
print("3-gram: ", extract_ngrams(data, 3))
print("4-gram: ", extract_ngrams(data, 4))

1-gram:  ['It', 'is', 'at', 'the', 'top', 'of', 'valleys', 'mountain', 'Best', 'place', 'to', 'get', 'a', 'pleasure', 'I', 'really', 'love', 'the', 'place', 'We', 'can', 'see', 'whole', 'Kathmandu', 'valley', 'from', 'there', 'Best', 'to', 'visit', 'there', 'once', 'in', 'a', 'life', 'We', 'must', 'visit', 'there', 'in', 'life', 'to', 'get', 'some', 'Best', 'experience', 'in', 'life', '‚Ä¶']
2-gram:  ['It is', 'is at', 'at the', 'the top', 'top of', 'of valleys', 'valleys mountain', 'mountain Best', 'Best place', 'place to', 'to get', 'get a', 'a pleasure', 'pleasure I', 'I really', 'really love', 'love the', 'the place', 'place We', 'We can', 'can see', 'see whole', 'whole Kathmandu', 'Kathmandu valley', 'valley from', 'from there', 'there Best', 'Best to', 'to visit', 'visit there', 'there once', 'once in', 'in a', 'a life', 'life We', 'We must', 'must visit', 'visit there', 'there in', 'in life', 'life to', 'to get', 'get some', 'some Best', 'Best experience', 'experience in', 'in l

**Term Frequency**

In [94]:
tf = df_train['review'][1:2].apply(lambda x: pd.value_counts(x.split())/len(x.split())).sum(axis=0).reset_index()
tf.columns = ['words', 'tf']
tf

Unnamed: 0,words,tf
0,place,0.073171
1,this,0.04878
2,of,0.04878
3,visit,0.02439
4,day,0.02439
5,light.,0.02439
6,And,0.02439
7,be,0.02439
8,aware,0.02439
9,monkeys.,0.02439


**Inverse Document Frequency**


IDF = log(N/n), where, N is the total number of rows and n is the number of rows in which the word was present

In [95]:
for i,word in enumerate(tf['words']):
    tf.loc[i, 'idf'] = np.log(df_train.shape[0]/(len(df_train[df_train['review'].str.contains(word)])))    
tf

Unnamed: 0,words,tf,idf
0,place,0.073171,0.738957
1,this,0.04878,2.403543
2,of,0.04878,0.833501
3,visit,0.02439,1.617496
4,day,0.02439,3.10608
5,light.,0.02439,4.89784
6,And,0.02439,4.449291
7,be,0.02439,1.15703
8,aware,0.02439,5.923693
9,monkeys.,0.02439,3.891398


**TF-IDF**

In [96]:
tfidf = TfidfVectorizer(max_features=10000, lowercase=True, analyzer='word', stop_words= 'english',ngram_range=(1,1))
df_train_tfidf = tfidf.fit_transform(df_train['review'])
df_train_tfidf

<7102x7499 sparse matrix of type '<class 'numpy.float64'>'
	with 67173 stored elements in Compressed Sparse Row format>

**Bag of Words**

In [97]:
bag_of_words = CountVectorizer(max_features=10000, lowercase=True, ngram_range=(1,1),analyzer = "word")
df_train_bag_of_words = bag_of_words.fit_transform(df_train['review'])
df_train_bag_of_words

<7102x7769 sparse matrix of type '<class 'numpy.int64'>'
	with 113946 stored elements in Compressed Sparse Row format>

**Sentiment Analysis**


In [98]:
df_train['sentiment2'] = df_train['review'][:20].apply(lambda x: TextBlob(x).sentiment[0])
df_train[['review','sentiment2']].head(5)

Unnamed: 0,review,sentiment2
0,It is at the top of valleys mountain. Best pl...,0.7
1,This place has a significant importance in Bud...,0.329167
2,Visited this from the other side on a rainy ev...,0.266667
3,A beautiful temple situated in the capital wit...,0.55
4,"great, beautiful, historic & religious place.....",0.5


In [99]:
df_train.head(3)


Unnamed: 0,ID,location,review,sentiment,char_count,word_count,hashtags,stopwords,upper_word,avg_word_len,sentiment2
0,1,Syambhunath,It is at the top of valleys mountain. Best pl...,1,242,51,0,19,1,3.9,0.7
1,2,Syambhunath,This place has a significant importance in Bud...,1,229,41,0,18,0,4.6,0.329167
2,3,Syambhunath,Visited this from the other side on a rainy ev...,1,240,43,0,20,0,4.6,0.266667


**Pre-processing**


In [100]:
# Creating a copy of dataset to preprocess the data

df_train_dpp = df_train.copy()

In [101]:
df_train_dpp['review_lower'] = df_train_dpp['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df_train_dpp[['review', 'review_lower']].head()

Unnamed: 0,review,review_lower
0,It is at the top of valleys mountain. Best pl...,it is at the top of valleys mountain. best pla...
1,This place has a significant importance in Bud...,this place has a significant importance in bud...
2,Visited this from the other side on a rainy ev...,visited this from the other side on a rainy ev...
3,A beautiful temple situated in the capital wit...,a beautiful temple situated in the capital wit...
4,"great, beautiful, historic & religious place.....","great, beautiful, historic & religious place....."


**Stopwords Removal**

In [102]:
stop_words = stopwords.words('english')

df_train_dpp['review_stopwords'] = df_train_dpp['review_lower'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
df_train_dpp[['review', 'review_stopwords']].head()

Unnamed: 0,review,review_stopwords
0,It is at the top of valleys mountain. Best pl...,top valleys mountain. best place get pleasure....
1,This place has a significant importance in Bud...,place significant importance buddhism. visited...
2,Visited this from the other side on a rainy ev...,"visited side rainy evening, actually visit tem..."
3,A beautiful temple situated in the capital wit...,beautiful temple situated capital stunning vie...
4,"great, beautiful, historic & religious place.....","great, beautiful, historic & religious place....."


**Punctuation Removal**

In [103]:
df_train_dpp['review_punc'] = df_train_dpp['review_stopwords'].str.replace('[^\w\s]', '')
df_train_dpp[['review', 'review_punc']].head()

  df_train_dpp['review_punc'] = df_train_dpp['review_stopwords'].str.replace('[^\w\s]', '')


Unnamed: 0,review,review_punc
0,It is at the top of valleys mountain. Best pl...,top valleys mountain best place get pleasure r...
1,This place has a significant importance in Bud...,place significant importance buddhism visited ...
2,Visited this from the other side on a rainy ev...,visited side rainy evening actually visit temp...
3,A beautiful temple situated in the capital wit...,beautiful temple situated capital stunning vie...
4,"great, beautiful, historic & religious place.....",great beautiful historic religious place crow...


**Common word removal**

In [104]:
# Frequency of common words in all the tweets

common_top20 = pd.Series(' '.join(df_train_dpp['review_punc']).split()).value_counts()[:20]
print(common_top20)


# Remove these top 20 freq words
common = list(common_top20.index)

df_train_dpp['review_comm_remv'] = df_train_dpp['review_punc'].apply(lambda x: " ".join(x for x in x.split() if x not in common))
df_train_dpp[['review','review_comm_remv']].head()

place        4014
visit        1339
temple       1243
beautiful    1074
one          1071
nepal        1024
kathmandu     800
best          761
good          675
park          634
nice          579
see           577
great         559
world         545
amazing       532
view          512
national      510
heritage      485
must          463
site          446
dtype: int64


Unnamed: 0,review,review_comm_remv
0,It is at the top of valleys mountain. Best pl...,top valleys mountain get pleasure really love ...
1,This place has a significant importance in Bud...,significant importance buddhism visited sunset...
2,Visited this from the other side on a rainy ev...,visited side rainy evening actually end mostly...
3,A beautiful temple situated in the capital wit...,situated capital stunning vies city insight cu...
4,"great, beautiful, historic & religious place.....",historic religious crowded visitors around poi...


**Rare words removal**

In [105]:
# Frequency of common words in all the tweets
rare_top20 = pd.Series(" ".join(df_train_dpp['review_comm_remv']).split()).value_counts()[-20:]
rare_top20

# Remove these top 20 common words
rare = list(rare_top20.index)

df_train_dpp['review_rare_remv'] = df_train_dpp['review_comm_remv'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))
df_train_dpp[['review','review_rare_remv']].head()

Unnamed: 0,review,review_rare_remv
0,It is at the top of valleys mountain. Best pl...,top valleys mountain get pleasure really love ...
1,This place has a significant importance in Bud...,significant importance buddhism visited sunset...
2,Visited this from the other side on a rainy ev...,visited side rainy evening actually end mostly...
3,A beautiful temple situated in the capital wit...,situated capital stunning vies city insight cu...
4,"great, beautiful, historic & religious place.....",historic religious crowded visitors around poi...


**Spelling correction**

In [106]:
# Using textblob

df_train_dpp['review_rare_remv'][:10].apply(lambda x: str(TextBlob(x).correct()))

0    top valleys mountain get pleasure really love ...
1    significant importance buddhist visited sunset...
2    visited side rainy evening actually end mostly...
3    situated capital stunning view city insight cu...
4    historic religious crowded visitors around poi...
5    pleased pleasures touches directly soul look r...
6    staying tm go early morning morning walk thing...
7    swayambhunath steps crowning glories valley ar...
8    its would definitely recommend mind like valle...
9    steps located top mountain enjoy city skyline ...
Name: review_rare_remv, dtype: object

**Tokenization**

In [107]:
df_train_dpp['review_rare_remv'][:10].apply(lambda x: TextBlob(x).words)

0    [top, valleys, mountain, get, pleasure, really...
1    [significant, importance, buddhism, visited, s...
2    [visited, side, rainy, evening, actually, end,...
3    [situated, capital, stunning, vies, city, insi...
4    [historic, religious, crowded, visitors, aroun...
5    [pleased, pleasures, touches, directly, soul, ...
6    [staying, ktm, go, early, morning, morning, wa...
7    [swayambhunath, stupa, crowning, glories, vall...
8    [its, would, definitely, recommend, mini, hike...
9    [stupa, located, top, mountain, enjoy, city, s...
Name: review_rare_remv, dtype: object

**Stemming**

In [108]:
st = PorterStemmer()
df_train_dpp['review_rare_remv'][:10].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0    top valley mountain get pleasur realli love wh...
1    signific import buddhism visit sunset recommen...
2    visit side raini even actual end mostli crowdi...
3    situat capit stun vie citi insight cultur offe...
4    histor religi crowd visitor around point amaz ...
5    pleas pleasur touch directli soul look recomme...
6          stay ktm go earli morn morn walk thing wear
7    swayambhunath stupa crown glori valley archite...
8    it would definit recommend mini hike valley vi...
9    stupa locat top mountain enjoy citi skylin ple...
Name: review_rare_remv, dtype: object

**Lemmatization**

In [109]:
df_train_dpp['review_rare_remv'][:10].apply(lambda x: " ".join(Word(word) for word in x.split()))

0    top valleys mountain get pleasure really love ...
1    significant importance buddhism visited sunset...
2    visited side rainy evening actually end mostly...
3    situated capital stunning vies city insight cu...
4    historic religious crowded visitors around poi...
5    pleased pleasures touches directly soul look r...
6    staying ktm go early morning morning walk thin...
7    swayambhunath stupa crowning glories valley ar...
8    its would definitely recommend mini hike valle...
9    stupa located top mountain enjoy city skyline ...
Name: review_rare_remv, dtype: object

In [110]:
df_train_dpp

Unnamed: 0,ID,location,review,sentiment,char_count,word_count,hashtags,stopwords,upper_word,avg_word_len,sentiment2,review_lower,review_stopwords,review_punc,review_comm_remv,review_rare_remv
0,1,Syambhunath,It is at the top of valleys mountain. Best pl...,1,242,51,0,19,1,3.9,0.700000,it is at the top of valleys mountain. best pla...,top valleys mountain. best place get pleasure....,top valleys mountain best place get pleasure r...,top valleys mountain get pleasure really love ...,top valleys mountain get pleasure really love ...
1,2,Syambhunath,This place has a significant importance in Bud...,1,229,41,0,18,0,4.6,0.329167,this place has a significant importance in bud...,place significant importance buddhism. visited...,place significant importance buddhism visited ...,significant importance buddhism visited sunset...,significant importance buddhism visited sunset...
2,3,Syambhunath,Visited this from the other side on a rainy ev...,1,240,43,0,20,0,4.6,0.266667,visited this from the other side on a rainy ev...,"visited side rainy evening, actually visit tem...",visited side rainy evening actually visit temp...,visited side rainy evening actually end mostly...,visited side rainy evening actually end mostly...
3,4,Syambhunath,A beautiful temple situated in the capital wit...,1,236,42,0,18,1,4.6,0.550000,a beautiful temple situated in the capital wit...,beautiful temple situated capital stunning vie...,beautiful temple situated capital stunning vie...,situated capital stunning vies city insight cu...,situated capital stunning vies city insight cu...
4,5,Syambhunath,"great, beautiful, historic & religious place.....",1,164,24,0,8,0,5.3,0.500000,"great, beautiful, historic & religious place.....","great, beautiful, historic & religious place.....",great beautiful historic religious place crow...,historic religious crowded visitors around poi...,historic religious crowded visitors around poi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7097,7267,Pokhara,"It's a nice place to sit back, and enjoy. The ...",1,246,47,0,19,0,4.3,,"it's a nice place to sit back, and enjoy. the ...","nice place sit back, enjoy. calm fresh air lak...",nice place sit back enjoy calm fresh air lake ...,sit back enjoy calm fresh air lake especially ...,sit back enjoy calm fresh air lake especially ...
7098,7268,Pokhara,"Excellent Place to visit, Lifetime memories",1,43,6,0,1,0,6.3,,"excellent place to visit, lifetime memories","excellent place visit, lifetime memories",excellent place visit lifetime memories,excellent lifetime memories,excellent lifetime memories
7099,7269,Pokhara,It's very photogenic and relaxing when there a...,1,68,11,0,6,0,5.3,,it's very photogenic and relaxing when there a...,photogenic relaxing many people.,photogenic relaxing many people,photogenic relaxing many people,photogenic relaxing many people
7100,7270,Pokhara,U can get real definition of nature's beauty a...,1,60,11,0,4,1,4.5,,u can get real definition of nature's beauty a...,u get real definition nature's beauty peace,u get real definition natures beauty peace,u get real definition natures beauty peace,u get real natures beauty peace


In [112]:
df_train_dpp.drop(columns = ['char_count','word_count','hashtags','stopwords','upper_word','avg_word_len','sentiment2','review_punc','review_comm_remv','review_rare_remv' ], axis=1, inplace=True)


In [113]:
df_train_dpp

Unnamed: 0,ID,location,review,sentiment,review_lower,review_stopwords
0,1,Syambhunath,It is at the top of valleys mountain. Best pl...,1,it is at the top of valleys mountain. best pla...,top valleys mountain. best place get pleasure....
1,2,Syambhunath,This place has a significant importance in Bud...,1,this place has a significant importance in bud...,place significant importance buddhism. visited...
2,3,Syambhunath,Visited this from the other side on a rainy ev...,1,visited this from the other side on a rainy ev...,"visited side rainy evening, actually visit tem..."
3,4,Syambhunath,A beautiful temple situated in the capital wit...,1,a beautiful temple situated in the capital wit...,beautiful temple situated capital stunning vie...
4,5,Syambhunath,"great, beautiful, historic & religious place.....",1,"great, beautiful, historic & religious place.....","great, beautiful, historic & religious place....."
...,...,...,...,...,...,...
7097,7267,Pokhara,"It's a nice place to sit back, and enjoy. The ...",1,"it's a nice place to sit back, and enjoy. the ...","nice place sit back, enjoy. calm fresh air lak..."
7098,7268,Pokhara,"Excellent Place to visit, Lifetime memories",1,"excellent place to visit, lifetime memories","excellent place visit, lifetime memories"
7099,7269,Pokhara,It's very photogenic and relaxing when there a...,1,it's very photogenic and relaxing when there a...,photogenic relaxing many people.
7100,7270,Pokhara,U can get real definition of nature's beauty a...,1,u can get real definition of nature's beauty a...,u get real definition nature's beauty peace


In [114]:
df_train_dpp.to_csv('Cleaned  and Preprocessed Dataset.csv', index=False)
