In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw dataset. header=None means no line of the CSV is used as
# column names, so columns get integer labels.
# NOTE(review): absolute, machine-specific path -- the notebook only runs on
# the original author's machine until this points at a local copy.
csv_path = 'C:\\Users\\mesho\\OneDrive\\Desktop\\SentiSum\\sentisum-assessment-dataset.csv'
df = pd.read_csv(csv_path, header=None)
df

In [None]:
# Keep only the columns that contain no missing values.
df = df.dropna(axis='columns')
df

In [None]:
# (rows, columns) of the frame after the column drop above.
print(df.shape)

In [None]:
### Row with index label 1. The file was read with header=None and no index
### column, so labels start at 0 and this is the second row of the frame.
### NOTE(review): if row 0 is the CSV's own header line, this is the first
### real record -- confirm against the file.
df.loc[1]

In [None]:
### First five rows of the dataset (DataFrame.head default).
df.head()

In [None]:
# Summary statistics restricted to the object-dtype columns.
df.describe(include=[object])

In [None]:
# NLP tooling: NLTK for tokenising / stemming / lemmatising, scikit-learn for TF-IDF.
import nltk
from nltk.corpus import stopwords  #stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import. The notebook never imports `re` explicitly,
# yet clean_text uses it -- presumably it reaches the namespace through this
# line. Confirm before removing or narrowing this import.
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
# English stop words held as a set; used for membership tests in clean_text
# and passed to the TF-IDF vectorizer further down.
stop_words=set(nltk.corpus.stopwords.words('english'))

In [None]:
# Demonstrate the English Snowball stemmer on a handful of sample words.
stemmer = SnowballStemmer("english")
original_words = [
    'alumnus', 'universal', 'waited', 'Flying', 'caring', 'flies', 'dies',
    'agreed', 'owned', 'humbled', 'sized', 'meeting', 'state', 'siezing',
    'itemization', 'sensational', 'traditionally', 'referencing',
    'colonizer', 'plotted', 'providing',
]
# Despite its name, `plural` holds the stem of each word, in input order.
plural = list(map(stemmer.stem, original_words))

# Side-by-side table of each word and its stem.
pd.DataFrame(data={'original word': original_words, 'stemmed': plural})

In [None]:
# Lemmatize 'working' as a verb (pos='v') to show how an inflected verb form
# is reduced to its dictionary form.
print(WordNetLemmatizer().lemmatize('working', pos = 'v')) 

In [None]:
def clean_text(text):
    """Normalise one raw document before vectorisation.

    Steps, in order: strip ``@mention`` handles, tokenise with NLTK, drop
    English stop words and tokens of three characters or fewer, lemmatise the
    surviving tokens, and join them back with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated lemmatised tokens; an empty string when no token
        survives the filters.
    """
    # Local import so the function does not depend on `re` leaking into the
    # namespace through a wildcard import elsewhere in the notebook.
    import re

    text = re.sub("@[A-Za-z0-9]+", '', text)
    le = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    # NLTK's English stop words are lower-case, so compare case-insensitively;
    # otherwise capitalised stop words ("This", "There") pass the filter.
    tokens = [
        le.lemmatize(w)
        for w in word_tokens
        if w.lower() not in stop_words and len(w) > 3
    ]
    return " ".join(tokens)

In [None]:
# Run clean_text over every value of column 0 and keep the result in a new
# 'cleaned_text' column next to the original text.
df['cleaned_text']=df[0].apply(clean_text)
df

In [None]:
# TF-IDF matrix over the cleaned documents, vocabulary capped at 1000 terms.
# scikit-learn validates `stop_words` as 'english', a list, or None; passing
# the set directly raises InvalidParameterError on recent versions, so it is
# converted to a (sorted, hence deterministic) list here.
vect = TfidfVectorizer(stop_words=sorted(stop_words), max_features=1000)
vect_text = vect.fit_transform(df['cleaned_text'])

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 12-topic LDA model on the TF-IDF matrix. lda_top is the transformed
# output: one row per document, one column per topic.
# NOTE(review): max_iter=1 performs a single pass over the data, and the model
# is fed TF-IDF weights rather than raw counts -- both affect topic quality;
# confirm they are intentional.
lda_model=LatentDirichletAllocation(n_components=12,
learning_method='online',random_state=42,max_iter=1) 
lda_top=lda_model.fit_transform(vect_text)

In [None]:
# List the 12 highest-weighted vocabulary terms of every LDA topic.
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
vocab = vect.get_feature_names_out()
for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:12]
    print("Topic " + str(i) + ": ")
    # Print the terms as one newline-terminated line so the next topic
    # header starts on its own line instead of trailing the previous words.
    print(" ".join(t[0] for t in sorted_words))

In [None]:
# Print every document's topic weights, scaled by 100 to read as percentages.
for doc_idx, topic_weights in enumerate(lda_top):
    print("Document ", doc_idx, ": ", topic_weights * 100, "%")

In [None]:
# Rescale the topic weights by 100 so they read as percentages.
# NOTE(review): lda_top is reassigned from itself, so running this cell a
# second time multiplies by 100 again -- run it exactly once per kernel session.
lda_top=lda_top*100
lda_top

In [None]:
# Convert the percentages to integers. astype(int) drops the fractional part
# rather than rounding (e.g. 49.9 becomes 49).
lda_top = lda_top.astype(int)
lda_top

In [None]:
# Highest topic score of each document (row-wise maximum).
maxvalues = lda_top.max(axis=1)
maxvalues

In [None]:
# Number of documents for which a top score was computed.
len(maxvalues)

In [None]:
# Column (topic id) holding each document's highest score.
# NOTE(review): lda_top was truncated to integers above, so two topics can tie
# on the same integer score; argmax then returns the lower topic id.
indexs = lda_top.argmax(axis=1)
indexs

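As we can see, each entry of `indexs` is the cluster (topic) with the highest score for that document, and the matching entry of `maxvalues` is that score, i.e. the estimated probability (as a percentage) that the document belongs to that cluster.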

Our next goal is to quantify how confident these cluster assignments are: if several clusters have nearly the same score for a document, it is hard to justify picking a single label for it. This approach may or may not be the correct one, but it seems appropriate, because we cannot choose a label solely on the basis of the highest probability.

So we'll use some assumptions (and accept some bias) here to annotate the dataset, and then train subtask b.

Now let's first check our dataset against assumption 1, i.e. a top score > 0.5 (50 on the percentage scale).

In [None]:
# Boolean mask: True where a document's top score is strictly above 50.
# A score of exactly 50 is False here.
greater_than_50 = (maxvalues > 50)
greater_than_50

In [None]:
# Positions of the documents whose top score exceeds 50.
# The filter previously read `if not val`, which collected the complement
# (documents NOT above 50) and contradicted the accompanying text.
res_50 = [i for i, val in enumerate(greater_than_50) if val]

In [None]:
# Number of positions collected in res_50.
len(res_50)

So as per our dataset we can clearly say that close to 5,047 samples have values > 50 (percentage) which means our model was smart enough to classify it on the basis of the features.

In [None]:
# Boolean mask: True where the top score is strictly between 40 and 50.
# Both bounds are strict, so a score of exactly 40 or exactly 50 is excluded
# from this band.
l_50_g_40 = (maxvalues > 40) & (maxvalues < 50)
l_50_g_40

In [None]:
# Positions of the documents whose top score lies between 40 and 50.
# The filter previously read `if not val`, which collected every document
# OUTSIDE the band and contradicted the accompanying text.
resl_50_g_40 = [i for i, val in enumerate(l_50_g_40) if val]

In [None]:
# Number of positions collected in resl_50_g_40.
len(resl_50_g_40)

So as per our dataset we can clearly say that close to 2,161 samples have values greater than 40 and less than 50 (percentage) which means our model was smart enough to classify it on the basis of the features.

In [None]:
# Boolean mask: True where the top score is strictly between 30 and 40.
# Both bounds are strict, so a score of exactly 30 or exactly 40 is excluded
# from this band.
l_40_g_30 = (maxvalues > 30) & (maxvalues < 40)
l_40_g_30

In [None]:
# Positions of the documents whose top score lies between 30 and 40.
# The filter previously read `if not val`, which collected every document
# OUTSIDE the band and contradicted the accompanying text.
resl_40_g_30 = [i for i, val in enumerate(l_40_g_30) if val]

In [None]:
# Number of positions collected in resl_40_g_30.
len(resl_40_g_30)

So as per our dataset we can clearly say that close to 1823 samples have values greater than 30 and less than 40 (percentage) which means our model was smart enough to classify it on the basis of the features.

So we'll add the indexes that we calculated to the dataset, since they are exactly the cluster labels.

In [None]:
# Attach each document's dominant-topic id and its score to the frame.
# NOTE: the label column is literally named 'index'. It is a regular column,
# distinct from df.index (the row labels), and must be read as df['index'].
df['index'] = indexs
df['high_val'] = maxvalues
df

Now that the data annotation part is done, we'll start training the supervised machine learning algorithms.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the documents for evaluation.
# The target is the dominant-topic label stored in the 'index' column.
# df.index (the row labels) was used before, which made every document its
# own unique class. random_state pins the split so reruns are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'], df['index'], test_size=0.10, random_state=42
)

In [None]:
## Build the TF-IDF vectorizer (default settings) that converts the cleaned
## messages to feature vectors for the classifiers.
vect_df = TfidfVectorizer()

In [None]:
# Learn the vocabulary and IDF weights from the training split only, and
# transform that split into its TF-IDF matrix.
X_train_text = vect_df.fit_transform(X_train)
X_train_text

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
#Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [None]:
# model_params = {
#     'svm': {
#         'model': SVC(gamma='auto'),
#         'params' : {
#             'C': [1,10,20],
#             'kernel': ['rbf','linear']
#         }  
#     },
#     'random_forest': {
#         'model': RandomForestClassifier(),
#         'params' : {
#             'n_estimators': [1,5,10]
#         }
#     },
#     'logistic_regression' : {
#         'model': LogisticRegression(solver='liblinear',multi_class='auto'),
#         'params': {
#             'C': [1,5,10,15]
#         }
#     },
#     'decision_tree': {
#         'model': DecisionTreeClassifier(),
#         'params' : {
#             'criterion': ['gini', 'entropy'],
#             'splitter': ['best','random']
#         }  
#     },
#     'knn': {
#         'model': KNeighborsClassifier(),
#         'params' : {
#             'n_neighbors': [5,7,9,11],
#             'algorithm' : ['ball_tree', 'kd_tree', 'brute']
#         }
#     },
#     'naive_bayes' : {
#         'model': MultinomialNB(),
#         'params': {
#         }
#     }
# }

In [None]:
# scores = []

# for model_name, mp in model_params.items():
#     clf =  GridSearchCV(mp['model'], mp['params'], cv=2, return_train_score=False)
#     clf.fit(X_train_text, y_train)
#     scores.append({
#         'model': model_name,
#         'best_score': clf.best_score_,
#         'best_params': clf.best_params_
#     })
    
# vals = pd.DataFrame(scores,columns=['model','best_score','best_params'])
# vals

In [None]:
# One instance of each candidate classifier, default hyper-parameters.
# The tree-based models draw random numbers while fitting, so they are seeded
# with the same random_state=42 the LDA model uses to keep reruns comparable.
mnb = MultinomialNB()
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
svm = SVC()
knn = KNeighborsClassifier()

In [None]:
# from sklearn.pipeline import Pipeline
# clf = Pipeline([
#     ('vectorizer', TfidfVectorizer()),
#     ('nb', MultinomialNB()),
#     ('lr', LogisticRegression()),
#     ('dt', DecisionTreeClassifier()),
#     ('rfc',RandomForestClassifier()),
#     ('svm',SVC()),
#     ('knn',KNeighborsClassifier())
# ])

In [None]:
# Fit multinomial naive Bayes on the training TF-IDF matrix and labels.
mnb.fit(X_train_text,y_train)

In [None]:
# Fit logistic regression on the training TF-IDF matrix and labels.
lr.fit(X_train_text,y_train)

In [None]:
# Fit the decision tree on the training TF-IDF matrix and labels.
dt.fit(X_train_text,y_train)

In [None]:
# Fit the random forest on the training TF-IDF matrix and labels.
rfc.fit(X_train_text,y_train)

In [None]:
# Fit the support-vector classifier on the training TF-IDF matrix and labels.
svm.fit(X_train_text,y_train)

In [None]:
# Fit k-nearest neighbours on the training TF-IDF matrix and labels.
knn.fit(X_train_text,y_train)

In [None]:
# Accuracy of every fitted classifier, shown side by side.
# A notebook cell displays only its last expression, so six bare .score()
# calls showed the k-NN result alone. The held-out split is scored as well,
# because training accuracy by itself says nothing about generalisation.
fitted_models = {
    'mnb': mnb,
    'lr': lr,
    'dt': dt,
    'rfc': rfc,
    'svm': svm,
    'knn': knn,
}
# Reuse the vocabulary learned on the training split; never refit on test data.
X_test_text = vect_df.transform(X_test)
pd.DataFrame({
    'train_accuracy': {
        name: clf.score(X_train_text, y_train)
        for name, clf in fitted_models.items()
    },
    'test_accuracy': {
        name: clf.score(X_test_text, y_test)
        for name, clf in fitted_models.items()
    },
})

In [None]:
# Classify one unseen review with a fitted classifier.
emails = ['A perfectly easy way to order tyres online. Just enter your car registration number, check the recommended tyres are correct and select the tyres you want. Select the best time and venue for the fitting and pay online. The whole process is easy and the best value for money!'
]
# Apply the same cleaning the training text went through before vectorising;
# the raw text was previously vectorised as-is.
emails_count = vect_df.transform([clean_text(email) for email in emails])
# `model` was never defined in this notebook (NameError on a fresh kernel).
# NOTE(review): the fitted logistic regression is used as a stand-in -- swap
# in whichever classifier was actually intended.
lr.predict(emails_count)

In [None]:
# Training-set accuracy of the classifier used for the prediction above.
# `model` was never defined in this notebook (NameError on a fresh kernel).
# NOTE(review): the fitted logistic regression is used as a stand-in -- swap
# in whichever classifier was actually intended.
lr.score(X_train_text, y_train)