In [22]:
import numpy as np
import pandas as pd
import re
import string
import pickle

In [23]:
# Sample input sentence used to exercise the preprocessing pipeline.
txt = 'great product. i love it'

In [24]:
def remove_punctuations(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are deleted; every other character (letters, digits, whitespace) is kept
    in its original order.
    """
    # Deletion table: maps each punctuation character to "remove".
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

In [25]:
# Load the English stop-word file into a list, one entry per line of the file.
# `sw` is read by preprocessing() when it filters tokens.
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()

In [26]:
from nltk.stem import PorterStemmer
# Single stemmer instance; preprocessing() calls ps.stem() on every token.
ps = PorterStemmer()

In [27]:
def preprocessing(text):
    """Clean a single raw sentence into the form expected by the vectorizer.

    The steps run in this order: lowercase every token, blank out tokens that
    start with ``http://`` or ``https://``, strip punctuation, strip digits,
    drop stop words (module-level ``sw``) and stem what remains with the
    module-level stemmer ``ps``.

    Parameters
    ----------
    text : str
        The raw sentence.

    Returns
    -------
    pandas.Series
        One-element Series named ``"tweet"`` holding the cleaned sentence.
    """
    def _lowercase(sentence):
        return " ".join(word.lower() for word in sentence.split())

    def _blank_urls(sentence):
        # Applied token by token, so only tokens that begin with the URL
        # scheme are emptied; the resulting extra spaces are collapsed by the
        # later split()/join() steps.
        return " ".join(
            re.sub(r'^https?:\/\/.*[\r\n]*', '', word, flags=re.MULTILINE)
            for word in sentence.split()
        )

    def _drop_stopwords(sentence):
        return " ".join(word for word in sentence.split() if word not in sw)

    def _stem(sentence):
        return " ".join(ps.stem(word) for word in sentence.split())

    frame = pd.DataFrame([text], columns=['tweet'])
    return (
        frame["tweet"]
        .apply(_lowercase)
        .apply(_blank_urls)
        .apply(remove_punctuations)
        .str.replace(r'\d+', '', regex=True)
        .apply(_drop_stopwords)
        .apply(_stem)
    )


In [28]:
# Run the sample sentence through the cleaning pipeline (result is a one-row Series).
preprocessed_txt = preprocessing(txt)

In [29]:
txt

'great product. i love it'

In [30]:
preprocessed_txt

0    great product love
Name: tweet, dtype: object

In [31]:
# Load the vocabulary file; the position of a token in `tokens` is its feature
# index in the vectors built by vectorizer().
# dtype=str + na_filter=False stop pandas from converting tokens such as
# "null" or "nan" into float NaN, which could never match a word and would
# silently disable that feature column. Row count and order are unchanged.
vocab = pd.read_csv('../static/model/vocabulary.txt', header=None, dtype=str, na_filter=False)
tokens = vocab[0].tolist()

In [32]:
tokens

['test',
 'android',
 'app',
 'beauti',
 'cute',
 'health',
 'iger',
 'iphoneonli',
 'iphonesia',
 'iphon',
 'final',
 'case',
 'thank',
 'yay',
 'soni',
 'xperia',
 'love',
 'would',
 'go',
 'talk',
 'relax',
 'smartphon',
 'wifi',
 'connect',
 'im',
 'know',
 'made',
 'way',
 'home',
 'amaz',
 'servic',
 'appl',
 'wont',
 'even',
 'question',
 'pay',
 'stupid',
 'support',
 'softwar',
 'updat',
 'fuck',
 'phone',
 'big',
 'time',
 'happi',
 'us',
 'instap',
 'instadaili',
 'xperiaz',
 'new',
 'type',
 'c',
 'charger',
 'cabl',
 'uk',
 '…',
 'amazon',
 'year',
 'newyear',
 'start',
 'technolog',
 'samsunggalaxi',
 'iphonex',
 'shop',
 'listen',
 'music',
 'likeforlik',
 'photo',
 'fun',
 'selfi',
 'water',
 'camera',
 'picoftheday',
 'sun',
 'instagood',
 'boy',
 'outdoor',
 'hey',
 'make',
 'ipod',
 'dont',
 'color',
 'inch',
 'crash',
 'everi',
 'need',
 'realli',
 'drop',
 'ball',
 'design',
 'give',
 'anoth',
 'crazi',
 'purchas',
 'lol',
 'work',
 'hard',
 'play',
 'ipad',
 'batt

In [33]:
# Note: in vectorizer(), the parameter `vocabulary` receives `tokens` and the parameter `ds` receives `preprocessed_txt`.

In [34]:
def vectorizer(ds, vocabulary):
    """Encode sentences as binary bag-of-words vectors over ``vocabulary``.

    Parameters
    ----------
    ds : iterable of str
        Pre-processed sentences; each one is split on whitespace into words.
    vocabulary : sequence of str
        Known tokens, indexable by position. Column ``i`` of the output
        corresponds to ``vocabulary[i]``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one row per sentence. Entry ``[r, i]`` is 1
        when ``vocabulary[i]`` occurs as a whole word in sentence ``r`` and
        0 otherwise. An empty ``ds`` yields an empty array.
    """
    vocabulary_size = len(vocabulary)
    vectorized_lst = []

    for sentence in ds:
        # Split once per sentence (not once per vocabulary entry) and keep the
        # words in a set so each membership test is constant time.
        words = set(sentence.split())

        sentence_lst = np.zeros(vocabulary_size, dtype=np.float32)
        for i in range(vocabulary_size):
            if vocabulary[i] in words:
                sentence_lst[i] = 1

        vectorized_lst.append(sentence_lst)

    return np.asarray(vectorized_lst, dtype=np.float32)

In [35]:
# Vectorize the cleaned sentence against the loaded vocabulary tokens.
vectorized_txt = vectorizer(preprocessed_txt,tokens)

In [36]:
vectorized_txt

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 1154), dtype=float32)

In [37]:
# Load the pickled model object used for prediction in the cells below.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load model files that come from a trusted source.
with open('../static/model/model.pickle','rb')as f:
    model = pickle.load(f)

In [38]:
# Raw label array predicted by the model for the vectorized sentence.
model.predict(vectorized_txt)

array([0])

In [41]:
def get_prediction(vectorized_txt, classifier=None):
    """Map the model's label for one vectorized sentence to a sentiment string.

    Parameters
    ----------
    vectorized_txt : array-like
        Feature matrix holding exactly one row (one sentence).
    classifier : object with a ``predict`` method, optional
        Model to query. Defaults to the module-level ``model``, which keeps
        existing one-argument calls working unchanged.

    Returns
    -------
    str
        ``'negative'`` when the predicted label equals 1, otherwise
        ``'positive'``.

    Raises
    ------
    ValueError
        If the model does not return exactly one prediction.
    """
    if classifier is None:
        classifier = model

    # Extract the scalar explicitly instead of relying on the truth value of
    # a NumPy array, which is ambiguous for anything but a single element.
    labels = np.asarray(classifier.predict(vectorized_txt)).ravel()
    if labels.size != 1:
        raise ValueError(
            f"get_prediction expects exactly one sample, got {labels.size} predictions"
        )

    return 'negative' if labels[0] == 1 else 'positive'

In [53]:
# End-to-end check on a new sentence: clean -> vectorize -> predict.
txt = "Good product. I love it"
preprocessed_txt = preprocessing(txt)
vectorized_txt = vectorizer(preprocessed_txt, tokens)
prediction = get_prediction(vectorized_txt)
prediction


'positive'