# Loading data

In [124]:
import pandas as pd
import numpy as np

df = pd.read_csv('post_dataset.csv')

df

Unnamed: 0,Post ID,Community ID,Community Name,Author ID,Author Username,Created At,Title,Content
0,15,62,hello-pet,1,hansome,2024-02-16 15:24:34.994040+00:00,hello,good morning
1,16,62,hello-pet,448,aaron94,2024-02-29 17:01:47.086217+00:00,The dog parks are doubling,My city tried out a new off-leash dog park for...
2,17,63,dogs lover,86,aaronphillips,2024-02-29 17:03:01.017136+00:00,"Held prisoner by dogs, waiting to get attacked.",My neighbors don't keep their dogs contained. ...
3,18,63,dogs lover,317,abbottjulie,2024-02-29 17:03:21.637885+00:00,Dogs in ABC's,I'm sick of this crazy dog culture! I'm lookin...
4,19,64,petmandu,137,abigailhill,2024-02-29 17:03:47.620809+00:00,Tried dating a guy with a dog (cautionary tale),Alright so. Ill spare you the deets and give y...
...,...,...,...,...,...,...,...,...
71,86,66,smart pet,179,williamsjennifer,2024-02-29 17:28:30.569599+00:00,Pet safe rug cleaning solution?,Got myself a nice rug cleaner recently. As we ...
72,87,62,hello-pet,390,jennifer62,2024-02-29 17:28:49.710914+00:00,Are there any ethically “caged pets”,"I adore animals, my current landlord only allo..."
73,88,63,dogs lover,329,cheryldaniel,2024-02-29 17:30:49.925274+00:00,Why are dog owners so sensitive?,Guy I was talking to kept referring to his dog...
74,89,63,dogs lover,244,tara46,2024-02-29 17:31:18.785730+00:00,A dog's life is more precious than a human life,I went to my auto insurance office to renegoti...


## creating a tags of all words

In [125]:
df["Content"] = df["Content"].apply(lambda x:x.split())
df["Title"] = df["Title"].apply(lambda x:x.split())

In [None]:
df["tags"] =df["Content"] + df["Title"]


In [None]:
df["tags"] = df["tags"].apply(lambda x: " ".join(x))


# Preprocessing the data

## changing to lower case

In [129]:
@np.vectorize
def to_lower(text):
    return text.lower()

In [131]:
df["tags"]=to_lower(df["tags"])


## remove html text patterns

In [132]:
import re

@np.vectorize
def remove_html_tags(text):
    pattern = re.compile("<.*?>")
    return pattern.sub(r"", text)


In [133]:
df["tags"] = remove_html_tags(df["tags"])


## remove punctuation

In [134]:
import string


exclude = string.punctuation

print(exclude)
@np.vectorize
def remove_punc(text):
    clean_text = str.maketrans("", "", exclude)
   
    return text.translate(clean_text)


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [135]:
remove_punc("ask @ram and @hari for help")

array('ask ram and hari for help', dtype='<U25')

In [137]:
df["tags"] = remove_punc(df["tags"])



## handeling slang words

In [138]:
abbreviations_dict = {}

with open("./slang.txt", "r") as file:
    for line in file:
        abbreviation, full_form = map(str.strip, line.lower().split("="))

        abbreviations_dict[abbreviation] = full_form

@np.vectorize
def handle_slang(text):
    for abbreviation, full_form in abbreviations_dict.items():
        text = text.replace(abbreviation, full_form)
    return text

In [139]:
handle_slang("it was mistake afaik")

array('it was mistake as far as i know', dtype='<U31')

In [140]:
df["tags"] = handle_slang(df["tags"])



## handeling emojis

In [171]:
import emoji

@np.vectorize
def handle_emoji(text):
    text_with_emojis = emoji.demojize(text)
    return text_with_emojis


In [172]:
handle_emoji("hi there 🔥")

array('hi there :fire:', dtype='<U15')

In [None]:
df["tags"] = handle_emoji(df["tags"])


## correct spelling

In [141]:
from textblob import TextBlob


@np.vectorize
def correct_words(text):
    txtblb = TextBlob(text)
    corrected_text = " ".join(txtblb.correct().words)
    return corrected_text


In [142]:
word = "it is the best splling book"

correct_words(word)

array('it is the best selling book', dtype='<U27')

In [143]:
df["tags"] = correct_words(df["tags"])


## stemming the text

In [144]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

@np.vectorize
def stemmer(text):
    return ps.stem(text)


In [145]:
stemmer("running"),stemmer("trees"),stemmer("running")

(array('run', dtype='<U3'),
 array('tree', dtype='<U4'),
 array('run', dtype='<U3'))

In [146]:
df["tags"] = stemmer(df["tags"])

## lematize the text

In [161]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize




@np.vectorize
def lemmatize_sentence(sentence):
    lemmatizer = WordNetLemmatizer()
    tokenized_words = word_tokenize(sentence)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokenized_words]
    lemmatized_sentence = " ".join(lemmatized_words)
    return lemmatized_sentence


In [162]:
lemmatize_sentence("mice are running")

array('mouse are running', dtype='<U17')

In [163]:
df["tags"] = lemmatize_sentence(df["tags"])

# Tokenization

## remove stops words

In [179]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
total_stop_words,stop_words= len(list(stopwords.words("english"))), list(stopwords.words("english"))


print(f"total stop words: {total_stop_words}")
stop_words


total stop words: 179


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ripple\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## vectorize using BOW 

In [194]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words="english")


In [195]:
vector = cv.fit_transform(df["tags"])

In [197]:
vector.toarray(),vector.toarray().shape


(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [2, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 (76, 2201))

# Exporting

## exporting processed data

In [218]:
df.to_csv("preprocesses_data.csv", index=False)

## exoprting vector and vectorizer

In [212]:
import pickle

In [215]:
pickle.dump(vector, open("utils/vector.pkl", "wb"))
pickle.dump(cv, open("utils/cv.pkl", "wb"))
