Text Annotation _ Reddit by Abrar

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

In [2]:
# Load dataset
dtypes = {'type':'category','subreddit':'category','score':'float64','title':'category','reviews':'category','author':'category'}
ds= pd.read_excel("reddit_FINAL_v12.xlsx", sheet_name="Sheet1", engine='openpyxl', dtype=dtypes)


In [3]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    if type(rawText) != str:
        return rawText
    procText = rawText
        
    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText,'html.parser').get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove)>0:
        procText = re.sub(charsToRemove,' ',procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+',' ',procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n',' ').replace('\r', '')

    # Remove special characters
    if len(specialCharsToRemove)>0:
        procText = re.sub(specialCharsToRemove,' ',procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower() 

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

In [4]:
# Tokenize words
def tokenize_words(words):
    if (type(words) != str) or (word_tokenize(words) == ''):
        return np.nan
    else:
        return word_tokenize(words)

In [5]:
# Create a dataframe with only the description
processedReviews =  pd.DataFrame(data=ds.reviews.apply(textPreProcess).values, index=ds.index, columns=['PreProcessedText']) 



In [6]:
# Tokenize text
processedReviews['Words'] =  processedReviews['PreProcessedText'].apply(tokenize_words)

In [12]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\asifa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [16]:
for i, review in enumerate(processedReviews['Words']):
    if isinstance(review, float):
        print(f'Row {i} contains a float value.')


Row 36 contains a float value.
Row 144 contains a float value.
Row 733 contains a float value.


In [17]:
# Drop rows with NaN values in the "Words" column
processedReviews = processedReviews.dropna(subset=['Words'])

# Extract tokens from the "Words" column
tokens = [word for review in processedReviews['Words'] for word in review]

# Perform POS tagging
tags = nltk.pos_tag(tokens)

# Print the POS tags
print(tags)


[('are', 'VBP'), ('there', 'EX'), ('any', 'DT'), ('i', 'NN'), ('do', 'VBP'), ("n't", 'RB'), ('wan', 'VB'), ('na', 'TO'), ('wait', 'VB'), ('a', 'DT'), ('month', 'NN'), ('to', 'TO'), ('receive', 'VB'), ('it', 'PRP'), ('i', 'JJ'), ('live', 'VBP'), ('in', 'IN'), ('italy', 'NN'), ('if', 'IN'), ('that', 'DT'), ('matters', 'NNS'), ('mujjo', 'VBP'), ('and', 'CC'), ('solo', 'VBP'), ('pelle', 'VB'), ('both', 'DT'), ('make', 'VBP'), ('great', 'JJ'), ('leather', 'NN'), ('cases', 'NNS'), ('and', 'CC'), ('are', 'VBP'), ('available', 'JJ'), ('in', 'IN'), ('europe', 'NN'), ('for', 'IN'), ('example', 'NN'), ('via', 'IN'), ('amazon', 'NN'), ('i', 'NN'), ('had', 'VBD'), ('them', 'PRP'), ('both', 'DT'), ('and', 'CC'), ('like', 'IN'), ('them', 'PRP'), ('a', 'DT'), ('lot', 'NN'), ('i', 'NN'), ('use', 'VBP'), ('mujjo', 'NN'), ('another', 'DT'), ('brand', 'NN'), ('is', 'VBZ'), ('decoded', 'VBN'), ('barely', 'RB'), ('used', 'VBN'), ('my', 'PRP$'), ('phone', 'NN'), ('today', 'NN'), ('but', 'CC'), ('it', 'PRP'),

In [18]:
# Filter only Nouns
nouns = []
for tag in tags:
    if tag[1][0]=="N":  # if if starts with a "N"
        nouns.append(tag[0])
print(nouns)

['i', 'month', 'italy', 'matters', 'leather', 'cases', 'europe', 'example', 'amazon', 'i', 'lot', 'i', 'mujjo', 'brand', 'phone', 'today', 'hours', 'usage', 'devices', 'ipad', 'devices', 'button', 'right', 'photo', 'devices', 'time', 'phone', 'months', 'i', 'times', 'times', 'i', 'phone', 'sos', 'alot', 't', 'answer', 'phone', 'case', 'iphones', 'phone', 'sos', 'way', 'bet', 'problem', 'i', 'phone', 'case', 'phone', 'feet', 'i', 'while', 'area', 'cell', 'service', 'time', 'sos', 'phone', 'store', 'iphone', 'weeks', 'm', 'afraid', 'i', 'earpiece', 'i', 'degrees', 'days', 'way', 'blower', 'jk', 'i', 'people', 'words', 'opinions', 'version', 'attention', 'iphone', 'journey', 'title', 'i', 'iphone', 'se', 'driver', 'camera', 'battery', 'camera', 'i', 'camera', 'person', 'point', 'i', 'screen', 'size', 'fits', 'palm', 'operation', 'day', 'hits', 'hand', 'decent', 'cover', 'look', 'perfect', 'size', 'series', 'feels', 'bulky', 'tbh', 'i', 't', 'wan', 'battery', 'battery', 'i', 'end', 'day', 

In [21]:
# Filter only Adjectives
# Filter adjectives from POS tagged tokens
adjectives = [token[0] for token in tags if token[1].startswith('JJ')]

# Print adjectives
print(adjectives)


['i', 'great', 'available', 'other', 'top', 'first', 'screen', 'i', 'few', 'more', 'few', 'i', 's', 'definitive', 'good', 'durable', 'invincible', 'i', 'uag', 'few', 'limited', 'only', 'i', 'i', 'i', 'like', 'i', 'bad', 'fine', 'im', 'hard', 'fine', 'subjective', 'different', 'worthy', 'daily', 'poor', 'small', 'screen', 'smaller', 'other', 'plus', 'much', 'i', 'frikkin', 'manageable', 'single', 'everyday', 'nostalgia', 'super', 'comfortable', 'bionic', 'se', 'smaller', 'bigger', 'free', 'sleeping', 'smaller', 'main', 'whatsapp', 'screen', 'wasteful', 'less', 'screen', 'more', 'productive', 'good', 'free', 'better', 'few', 'worth', 'more', 'free', 'social', 'most', 's', 'great', 'i', 'lol', 't', 'great', 'full', 'mid', 'heavy', 'usage', 'ive', 'wont', 'easy', 'full', 'heavy', 'weird', 'common', 'noticeable', 'rear', 'i', 'own', 'white', 'other', 'weird', 'i', 'normal', 'white', 'legs', 'direct', 'rear', 'thick', 'ok', 'i', 'first', 'pro', 'mine', 'i', 'comprehensive', 'new', 'hard', 'o

In [22]:
## Filter only Verbs
import nltk

# Tokenize a sentence
sentence = "The quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(sentence)

# Tag the tokens with POS tags
pos_tags = nltk.pos_tag(tokens)

# Filter out only the verbs
verbs = [token for token, tag in pos_tags if tag.startswith('VB')]

print(verbs)
# Output: ['jumps']


['jumps']
