# Generating Final Dataset


In [None]:
# Import all necessary libraries and packages

import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

import nltk

# Download the NLTK resources this notebook asks for, one after another.
for resource_name in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource_name)

import re, itertools
import nltk
from nltk.corpus import stopwords

import sys
!{sys.executable} -m pip install twokenize
import twokenize

!pip install langid
import langid

!pip install langdetect
from langdetect import detect
from langdetect import detect_langs

In [None]:
# Attach Google Drive to the Colab runtime at /content/drive/ so that later
# cells can read from and write to it.

import os
from google.colab import drive

drive.mount('/content/drive/')

In [None]:
# Load the source CSV from Drive into the `Corpus` DataFrame and preview it.
corpus_csv_path = r"/content/drive/My Drive/All_Match_Before_Preprocess.csv"
Corpus = pd.read_csv(corpus_csv_path)
Corpus.head()

In [None]:
# Build the 'Cleaned Comment' column from the raw 'Comment' text.
#
# Each step consumes the output of the previous one and the column is assigned
# once, as a whole. Two problems of a per-row loop are avoided this way:
#   * every step must start from the previous step's result, not from the
#     original value, otherwise earlier steps (strip, tab replacement) are
#     silently overwritten;
#   * chained indexing (Corpus[col][idx] = ...) is not guaranteed by pandas to
#     write back into the DataFrame.
# Missing comments (NaN) pass through the .str accessor unchanged.
Corpus['Cleaned Comment'] = (
    Corpus['Comment']
    # Remove leading and trailing whitespace, then turn the remaining tab and
    # newline characters into single spaces.
    .str.strip()
    .str.replace(r"[\t\n]", ' ', regex=True)
    # Remove punctuation: drop everything that is not a word character,
    # a digit or whitespace.
    .str.replace(r"[^\w\d\s]", '', regex=True)
    # Standardize words: cap every run of one repeated character at two
    # occurrences (e.g. "goooal" -> "gooal").
    .str.replace(r"(.)\1{2,}", r"\1\1", regex=True)
    # Convert text to lowercase.
    .str.lower()
)

Corpus.head()

In [None]:
# Load POS-Tags from ark-tweet-nlp tagger into DataFrame.
# The file is read as tab-separated with no header row; its three columns are
# named here. The next cell reads the "Tag" column row by row.
tagger = pd.read_csv('/content/drive/My Drive/taggerOutput.txt', sep="\t", header=None)
tagger.columns = ["Word", "Tag", "Confidence"]

# Tokenize all comments.
# The 'Lemmatized Token' column is created directly from 'Cleaned Comment' and
# assigned as a whole column; writing cell by cell through chained indexing
# (Corpus[col][idx] = ...) is not guaranteed to update the DataFrame.
Corpus['Lemmatized Token'] = Corpus['Cleaned Comment'].apply(twokenize.tokenizeRawTweetText)
Corpus.head()

In [None]:
# Remove stopwords and non-alpha words. Perform word lemmatization.

# Translate the external tagger's tags into WordNet part-of-speech constants.
# Any tag without an explicit entry falls back to NOUN.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['A'] = wn.ADJ     # This was changed for the new tagger
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant helpers, built once instead of once per word / per row.
stop_words = set(stopwords.words('english'))   # set => constant-time membership test
word_Lemmatized = WordNetLemmatizer()
tagger_tags = tagger["Tag"].tolist()            # plain list => cheap positional lookup
lemmatizable_tags = {"N", "A", "V", "R"}        # only these tags are lemmatized and kept

# Position of the current token inside the tagger output. It advances for
# EVERY token (kept or not), so the tagger file must contain exactly one row
# per token, in the same order as the tokens appear in the corpus.
tag_index = 0

# One entry per row of Corpus, in row order.
lemmatized_rows = []

for entry in Corpus['Lemmatized Token']:
    # Words of this comment that pass the filters below.
    Final_words = []

    for word in entry:
        # Keep only alphabetic tokens that are not English stop words.
        if word not in stop_words and word.isalpha():
            word_tag = tagger_tags[tag_index]
            if word_tag in lemmatizable_tags:
                Final_words.append(word_Lemmatized.lemmatize(word, tag_map[word_tag]))
        tag_index = tag_index + 1

    # The processed words are stored as the string form of the list.
    lemmatized_rows.append(str(Final_words))

# Assign the whole column positionally; this does not depend on the index labels.
Corpus['Lemmatized Token'] = lemmatized_rows

Corpus.head()

In [None]:
# Remove any non-english comments from the dataset

# A comment containing one of these substrings is kept even when langid does
# not classify it as English. Matching is case-sensitive, so both spellings
# are listed.
keep_keywords = ('buy', 'Buy', 'play', 'Play', 'download', 'Download')

# Collect the labels first and drop once afterwards: the DataFrame is not
# modified while it is being iterated, and it is not rebuilt for every
# removed row.
rows_to_drop = []
for idx, line in Corpus['Comment'].items():
    language = langid.classify(line)[0]
    if language != 'en' and not any(keyword in line for keyword in keep_keywords):
        print(line)
        rows_to_drop.append(idx)

# Index labels of the surviving rows are left as they were (no re-numbering).
Corpus = Corpus.drop(index=rows_to_drop)

In [None]:
# Remove usernames and replace them with a unique key

def clean_dataframe(df, columns):
    """Replace the values of the given columns with zero-padded numeric keys.

    For every column named in ``columns``, the distinct values are numbered
    0, 1, 2, ... in the order returned by ``Series.unique()``, and each cell is
    overwritten with its number rendered as a string, left-padded with zeros
    to a width of at least five characters.

    Args:
        df: DataFrame to encode. The listed columns are overwritten on this
            object; no copy is made.
        columns: Iterable of column names to encode.

    Returns:
        The same ``df`` object that was passed in.
    """
    for col_name in columns:
        # Number each distinct value of the column.
        key_for = {}
        for position, value in enumerate(df[col_name].unique()):
            key_for[value] = position

        def encode(value):
            return str(key_for[value]).zfill(5)

        df[col_name] = df[col_name].apply(encode)
    return df

# Only the 'Username' column is replaced with keys.
columns = ['Username']
Corpus = clean_dataframe(df=Corpus, columns=columns)

Corpus.head()

In [None]:
# Display the (rows, columns) size of Corpus at this point in the notebook.
Corpus.shape

In [None]:
# Switch the working directory to the Drive folder, then write Corpus there as
# 'FinalDataset.csv' without the index column.
# NOTE: os.chdir changes the working directory for every cell executed after this one.
os.chdir('/content/drive/My Drive/')
Corpus.to_csv('FinalDataset.csv', index=False)