In [None]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
array_of_labeled_csv=['Bioderma_labeled.csv',
                      'erave_labeled.csv',
                      'neutrogena_labeled.csv',
                      'labeled_mayb_foundation.csv',
                      'labeled_mayb_conc.csv',
                      'labeled_hp_envy.csv',
                      'labeled_Macbook m1.csv',
                      'labeled_lenovo x1.csv',
                      'labeled_lenovo e14.csv',
                      'galaxy7_labeled.csv'
                     ]
for product in range(len(array_of_labeled_csv)):
    data = pd.read_csv(array_of_labeled_csv[product])
    data.drop(['Positive','Negative','Neutral','Compound'],axis=1,inplace=True)
    data.drop(data[(data['Sentiment'] =='Neutral')].index, inplace=True)
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    def lemmatize_text(text):
        st = ""
        for w in w_tokenizer.tokenize(text):
            st = st + lemmatizer.lemmatize(w) + " "
        return st
    data['Review'] = data.Review.apply(lemmatize_text)
    reviews = data['Review'].values
    labels = data['Sentiment'].values
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)
    train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)
    vec = CountVectorizer(max_features = 3000)
    X = vec.fit_transform(train_sentences)
    vocab = vec.get_feature_names()
    X = X.toarray()
    word_counts = {}
    for l in range(2):
        word_counts[l] = defaultdict(lambda: 0)
    for i in range(X.shape[0]):
        l = train_labels[i]
        for j in range(len(vocab)):
            word_counts[l][vocab[j]] += X[i][j]
    def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
        a = word_counts[text_label][word] + 1
        b = n_label_items[text_label] + len(vocab)
        return math.log(a/b)
    def group_by_label(x, y, labels):
        data = {}
        for l in labels:
            data[l] = x[np.where(y == l)]
        return data
    def fit(x, y, labels):
        n_label_items = {}
        log_label_priors = {}
        n = len(x)
        grouped_data = group_by_label(x, y, labels)
        for l, data in grouped_data.items():
            n_label_items[l] = len(data)
            log_label_priors[l] = math.log(n_label_items[l] / n)
        return n_label_items, log_label_priors
    from nltk.tokenize import sent_tokenize,word_tokenize
    def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
        result = []
        for text in x:
            label_scores = {l: log_label_priors[l] for l in labels}
            words = set(w_tokenizer.tokenize(text))
            for word in words:
                if word not in vocab: continue
                for l in labels:
                    log_w_given_l = laplace_smoothing(n_label_items, vocab, word_counts, word, l)
                    label_scores[l] += log_w_given_l
            result.append(max(label_scores, key=label_scores.get))
        return result
    labels = [0,1]
    n_label_items, log_label_priors = fit(train_sentences,train_labels,labels)
    pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_sentences)
    #print("Prediction Results")
    #print(pred)
    accuaracy=accuracy_score(test_labels,pred)*100
    productname=array_of_labeled_csv[product]
   # print(productname)
    
    #print("Accuracy : ", accuaracy,"%")
    f = open("Accuracies.txt", "a")
    f.write("\n" + productname+ " " + str(accuaracy)+"\n")
    f.close()

In [1]:
import pandas as pd
import re
import string
import csv
from nltk.sentiment.vader import SentimentIntensityAnalyzer
array_of_csv=["lenovo x1.csv","lenovo e14.csv"
                     ]
for product in range(len(array_of_csv)):
    df = pd.read_csv(array_of_csv[product],sep='delimiter', header=None)
    df.columns =['Review']
    df = df.dropna()
    df['Review'] = df['Review'].str.replace(r'[^\w\s]+', '')
    df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

    sentiments = SentimentIntensityAnalyzer()
    df["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in df["Review"]]
    df["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in df["Review"]]
    df["Neutral"] = [sentiments.polarity_scores(i)["neu"] for i in df["Review"]]
    df['Compound'] = [sentiments.polarity_scores(i)["compound"] for i in df["Review"]]
    score = df["Compound"].values
    sentiment = []
    for i in score:
        if i >= 0.05 :
            sentiment.append('Positive')
        elif i <= -0.05 :
            sentiment.append('Negative')
        else:
            sentiment.append('Neutral')
    df["Sentiment"] = sentiment
    df.to_csv('labeled_'+ array_of_csv[product],index=False)
    print(df["Sentiment"].value_counts())

  df = pd.read_csv(array_of_csv[product],sep='delimiter', header=None)
  df['Review'] = df['Review'].str.replace(r'[^\w\s]+', '')


Neutral     1254
Positive     532
Negative     130
Name: Sentiment, dtype: int64


  df = pd.read_csv(array_of_csv[product],sep='delimiter', header=None)
  df['Review'] = df['Review'].str.replace(r'[^\w\s]+', '')


Neutral     3493
Positive    2454
Negative     743
Name: Sentiment, dtype: int64
