<a href="https://colab.research.google.com/github/Pavanvarma007/-Pavan_INFO5731_Spring2023/blob/main/INFO5731_Assignment_Four.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **INFO5731 Assignment Four**

In this assignment, you are required to conduct topic modeling and sentiment analysis based on **the dataset you created from assignment three**.

# **Question 1: Topic Modeling**

(30 points). This question is designed to help you develop a feel for the way topic modeling works and its connection to the human meanings of documents. Based on the dataset from assignment three, write a Python program to **identify the top 10 topics in the dataset**. Before answering this question, please review the materials in lesson 8, especially the code for LDA, LSA, and BERTopic. The following information should be reported:

(1) Features (text representation) used for topic modeling.

(2) Top 10 clusters for topic modeling.

(3) Summarize and describe the topic for each cluster. 


In [None]:

!pip install pyLDAvis 
!pip install ipympl
%matplotlib ipympl

In [None]:
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import pandas as pd

# Fetch the NLTK stop-word corpus, then create the shared stop-word list,
# word tokenizer and Porter stemmer used by the rest of the notebook.
nltk.download('stopwords')
stopwordList = stopwords.words('english')
tokenizer = RegexpTokenizer(pattern=r'\w+')
stemmer = PorterStemmer()

In [None]:
revw_df = pd.read_csv('Sentiment_reviews.csv',encoding='ISO-8859-1')

In [None]:
# 'Unnamed: 0' is presumably the index column saved by an earlier to_csv call.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
revw_df = revw_df.drop(columns=['Unnamed: 0'], errors='ignore')
# Show a small preview instead of dumping the whole frame.
revw_df.head()

In [None]:
from nltk.corpus import stopwords
from textblob import TextBlob

def lowerCase(revwsList):
  """Lower-case every entry of a pandas Series and collapse its whitespace.

  Each value is converted with ``str()``, split on whitespace, lower-cased
  token by token and re-joined with single spaces.

  Args:
    revwsList: pandas Series of review texts (non-string values are
      stringified first).

  Returns:
    A new Series of normalised, lower-case strings.
  """
  def _normalise(value):
    return " ".join(map(str.lower, str(value).split()))

  return revwsList.apply(_normalise)

def tokenization(revwsList):
  """Tokenise every entry of a Series with the notebook-level ``tokenizer``.

  Args:
    revwsList: pandas Series of strings.

  Returns:
    A Series whose values are the results of ``tokenizer.tokenize``.
  """
  return revwsList.apply(tokenizer.tokenize)

def stemming(revwsList):
  """Stem every token of every entry with the notebook-level ``stemmer``.

  Args:
    revwsList: pandas Series whose values are iterables of tokens.

  Returns:
    A Series of lists containing ``stemmer.stem(token)`` for each token.
  """
  def _stem_tokens(tokens):
    return list(map(stemmer.stem, tokens))

  return revwsList.apply(_stem_tokens)

In [None]:
from nltk.corpus import stopwords
from textblob import TextBlob

# Lower-case, tokenise and stem the preprocessed review text in one chain.
revwsList = stemming(
    tokenization(
        lowerCase(revw_df['Preprocessed_Review_Text'])
    )
)

In [None]:
revw_df['Stemming'] = revwsList
revw_df


In [None]:
stemvaluesList = revw_df['Stemming'].values.tolist()

In [None]:
from gensim import corpora, models
from pprint import pprint


def Bigram():
  """Train a gensim bigram phrase model on the notebook-level ``stemvaluesList``.

  Returns:
    A ``(phrases, phraser)`` tuple: the ``models.Phrases`` object and the
    ``models.phrases.Phraser`` built from it.
  """
  phrase_model = models.Phrases(stemvaluesList, min_count=5, threshold=100)
  return phrase_model, models.phrases.Phraser(phrase_model)

def Trigram():
  """Train a gensim trigram phrase model on top of a freshly built bigram model.

  The bigram ``Phrases`` object returned by ``Bigram()`` is applied to
  ``stemvaluesList`` and the result is used to train a second ``Phrases`` model.

  Returns:
    A ``(phrases, phraser)`` tuple for the trigram model.
  """
  bigram_phrases, _ = Bigram()
  trigram_phrases = models.Phrases(bigram_phrases[stemvaluesList], threshold=100)
  return trigram_phrases, models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts, bigram_mod):
    """Apply ``bigram_mod`` to each document in ``texts``.

    Args:
        texts: iterable of tokenised documents.
        bigram_mod: object supporting ``bigram_mod[doc]`` indexing.

    Returns:
        A list with ``bigram_mod[doc]`` for every document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts, bigram_mod, trigram_model=None):
  """Apply the bigram model and then the trigram model to each document.

  Args:
    texts: iterable of tokenised documents.
    bigram_mod: object supporting ``bigram_mod[doc]`` indexing.
    trigram_model: object supporting ``trigram_model[doc]`` indexing. When
      omitted, the notebook-level ``trigram_mod`` is used, which is what the
      original implementation always did.

  Returns:
    A list with ``trigram_model[bigram_mod[doc]]`` for every document.
  """
  if trigram_model is None:
    # Backward-compatible fallback to the global the function used to read
    # implicitly; passing the model explicitly avoids that hidden dependency.
    trigram_model = trigram_mod
  return [trigram_model[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp_model=None):
    """Lemmatise documents, keeping only tokens with an allowed POS tag.

    Args:
        texts: iterable of tokenised documents (each an iterable of strings).
        allowed_postags: POS tags to keep; tokens whose ``pos_`` is not in
            this collection are dropped. The default list is only read,
            never modified.
        nlp_model: callable that takes a string and yields tokens exposing
            ``lemma_`` and ``pos_``. When omitted, the notebook-level ``nlp``
            is used, which is what the original implementation always did.

    Returns:
        A list with one list of lemmas per input document.
    """
    if nlp_model is None:
        # Fallback to the global pipeline; it must exist before this call.
        nlp_model = nlp
    texts_out = []
    for sent in texts:
        doc = nlp_model(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def MakeCorpus(data_lemmatized):
  """Convert tokenised documents into gensim bag-of-words vectors.

  A ``corpora.Dictionary`` is built from ``data_lemmatized`` inside the
  function; it is used for the conversion and is not returned.

  Args:
    data_lemmatized: list of tokenised documents.

  Returns:
    A list with ``doc2bow`` output for every document.
  """
  vocabulary = corpora.Dictionary(data_lemmatized)
  return [vocabulary.doc2bow(tokens) for tokens in data_lemmatized]


def docLda(corpus, LDA_Model):
  """Return ``LDA_Model[corpus]``, i.e. the model applied to ``corpus`` by indexing."""
  transformed = LDA_Model[corpus]
  return transformed

In [None]:
# Bigram() / Trigram() each return a (Phrases, Phraser) tuple; keep the
# Phraser (index 1) for applying the model to documents.
bigram = Bigram()
Big_Mod = bigram[1]
triGram = Trigram()
trigram_mod = triGram[1]
# Preview the phrase models on the first tokenised review. The previous code
# indexed trigram_mod with the `bigram` tuple itself, which is not a document.
print(trigram_mod[Big_Mod[stemvaluesList[0]]])

In [None]:
import spacy

# Load the small English spaCy pipeline with the parser and NER pipes disabled,
# merge bigrams in the stemmed reviews, then lemmatise the merged documents.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_words_bigrams = make_bigrams(stemvaluesList, Big_Mod)
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)
print(data_lemmatized[:1])

In [None]:
Corpus = MakeCorpus(data_lemmatized)
print(Corpus[:1])

In [None]:
id2word = corpora.Dictionary(data_lemmatized)
print(id2word)

In [None]:
# Human-readable view of the first document's bag-of-words as (token, count) pairs.
[[(id2word[token_id], count) for token_id, count in bow] for bow in Corpus[:1]]

In [None]:
# Fit an LDA topic model on the bag-of-words corpus, using `id2word` to map
# token ids back to words. random_state is fixed for reproducibility.
# NOTE(review): num_topics is 20 here, while the assignment text asks for the
# top 10 topics — confirm which is intended.
LDA_Model = models.ldamodel.LdaModel(corpus=Corpus,
                                           id2word=id2word,
                                           num_topics = 20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
LDA_Model.print_topics()

In [None]:
# Keep the result under its own name: rebinding `docLda` to the return value
# would shadow the helper function and make this cell fail when re-run.
doc_topics = docLda(Corpus, LDA_Model)
doc_topics

In [None]:
from gensim.models import CoherenceModel
print('\nPerplexity: ', LDA_Model.log_perplexity(Corpus))
coherence_model_lda = CoherenceModel(model=LDA_Model, 
                                     texts=data_lemmatized, 
                                     dictionary=id2word,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
lsamodel = models.LsiModel(Corpus,
                           num_topics = 20, 
                           id2word = id2word)


In [None]:
pprint(lsamodel.print_topics(num_topics = 20))

In [None]:
# Sweep the number of LSI topics and record the c_v coherence of each model.
coherence_values = []
model_list = []
for num_topics in range(2, 12, 1):
  # Use the loop variable; a hard-coded 20 here would train the same model
  # on every iteration and make the sweep meaningless.
  model = models.LsiModel(Corpus,
                          num_topics = num_topics,
                          id2word = id2word)
  model_list.append(model)
  # `texts` must be the tokenised documents that `id2word` and `Corpus` were
  # built from (data_lemmatized), matching the LDA coherence cell. The
  # previous `review_df` is not defined until the sentiment-analysis section.
  coherencemodel = CoherenceModel(model= model, texts = data_lemmatized,
                                  dictionary = id2word,
                                  coherence='c_v')
  coherence_values.append(coherencemodel.get_coherence())

In [None]:
!pip install biterm

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from biterm.btm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary


In [None]:
# Biterm topic model on the first 100 reviews. Use `revw_df`, the frame loaded
# earlier in this section; `review_df` is only created later in the notebook,
# so referencing it here fails on a fresh kernel.
biterm = revw_df['Preprocessed_Review_Text'].head(100).values
vector = TfidfVectorizer(stop_words='english')
X_tfidf =vector.fit_transform(biterm).toarray()


# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on older installations.
if hasattr(vector, 'get_feature_names_out'):
    text = np.array(vector.get_feature_names_out())
else:
    text = np.array(vector.get_feature_names())
words = vec_to_biterms(X_tfidf)

model = oBTM(num_topics=10, V=text)
model_lda= model.fit_transform(words, iterations=10)


topic_summuary(model.phi_wz.T, X_tfidf, text, 10)

# **Question 2: Sentiment Analysis**

(30 points). Sentiment analysis, also known as opinion mining, is a subfield of Natural Language Processing (NLP) that builds machine learning algorithms to classify a text according to the sentiment polarity of the opinions it contains, e.g., positive, negative, or neutral. The purpose of this question is to develop a machine learning classifier for sentiment analysis. Based on the dataset from assignment three, write a Python program to implement a sentiment classifier and evaluate its performance. Notice: **80% data for training and 20% data for testing**.  

(1) Features used for sentiment classification and explain why you select these features.

(2) Select two supervised learning algorithms from the scikit-learn library: https://scikit-learn.org/stable/supervised_learning.html#supervised-learning, and build a sentiment classifier with each of them. Note: Cross-validation (5-fold or 10-fold) should be conducted. Here is the reference for cross-validation: https://scikit-learn.org/stable/modules/cross_validation.html.

(3) Compare the performance over accuracy, precision, recall, and F1 score for the two algorithms you selected. Here is the reference of how to calculate these metrics: https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9. 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer





In [None]:
# Read with the same encoding as the first load of this file in the topic
# modeling section, so both sections decode the review text identically.
review_df = pd.read_csv('Sentiment_reviews.csv', encoding='ISO-8859-1')

In [None]:
# 'Unnamed: 0' is presumably the index column saved by an earlier to_csv call.
# Reassigning with errors='ignore' keeps the cell idempotent: re-running it
# no longer raises KeyError once the column has been removed.
review_df = review_df.drop(columns=['Unnamed: 0'], errors='ignore')
# Show a small preview instead of dumping the whole frame.
review_df.head()

In [None]:
# Build TF-IDF features from the preprocessed review text and print the
# resulting matrix shape.
# NOTE(review): the vectorizer is fit on every row here, before the train/test
# split in a later cell, so the vocabulary and IDF weights are also learned
# from the held-out reviews — consider fitting on the training split only.
tfidf_vector = TfidfVectorizer()
X_tfidf = tfidf_vector.fit_transform(review_df['Preprocessed_Review_Text'])
print(X_tfidf.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
mnb = MultinomialNB()
svm = LinearSVC()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X_tfidf,
                                                    review_df['Sentiment_'], 
                                                    test_size=0.2, 
                                                    random_state=42)
model_mnb = mnb.fit(x_train,y_train)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# Score the fitted Multinomial Naive Bayes model on the held-out split;
# metric arguments are given in (y_true, y_pred) order.
y_pred_mnb = model_mnb.predict(x_test)
print('Accuracy %s' % accuracy_score(y_test, y_pred_mnb))
print(classification_report(y_test, y_pred_mnb))


In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validate on the 80% training split, not the 20% test split, which must
# stay untouched for the final evaluation. The assignment asks for 5- or
# 10-fold cross-validation, so use 5 folds.
scores = cross_val_score(mnb, x_train, y_train, cv=5)
print("using MNB",scores.mean())

In [None]:
model_svm = svm.fit(x_train,y_train)
y_pred_svm = model_svm.predict(x_test)


In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered instead of showing the escaped repr (matches the MNB report cell).
print(classification_report(y_test, y_pred_svm))

In [None]:
# Accuracy of the linear SVM on the held-out split, arguments in (y_true, y_pred) order.
print('Accuracy %s' % accuracy_score(y_test, y_pred_svm))

In [None]:
# Cross-validate on the 80% training split, not the 20% test split, which must
# stay untouched for the final evaluation. The assignment asks for 5- or
# 10-fold cross-validation, so use 5 folds.
scores = cross_val_score(svm, x_train, y_train, cv=5)
print("using svm",scores.mean())



# **Question 3: House price prediction**

(40 points). You are required to build a **regression** model to predict the house price with 79 explanatory variables describing (almost) every aspect of residential homes. The purpose of this question is to practice regression analysis, a supervised learning technique. The training data, testing data, and data description files can be downloaded from Canvas. Here is an example implementation: https://towardsdatascience.com/linear-regression-in-python-predict-the-bay-areas-home-price-5c91c8378878. 


In [None]:
import pandas as pd
def getDataframe(filepath):
  """Load a CSV into a pandas DataFrame using ``pd.read_csv`` defaults.

  Args:
    filepath: path or buffer passed straight to ``pd.read_csv``.

  Returns:
    The DataFrame produced by ``pd.read_csv``.
  """
  frame = pd.read_csv(filepath)
  return frame


In [None]:
test = getDataframe('test.csv')
test


In [None]:
train = getDataframe('train.csv')
train

In [None]:
# NOTE(review): this pins matplotlib to 3.1.3 after matplotlib.pyplot has
# already been imported in an earlier cell; the running kernel presumably
# needs a restart before the pinned version is actually used — confirm this
# install is still required.
!pip install matplotlib==3.1.3

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
train.hist(bins=50, figsize=(20,15))
#plt.savefig("attribute_histogram_plots")
plt.show()

In [None]:
# `train` still contains string-valued columns at this point. Restrict to
# numeric columns explicitly: DataFrame.corr() raises on non-numeric data in
# pandas 2.x and emits a FutureWarning in 1.5.
cor = train.select_dtypes(include='number').corr()
cor

In [None]:
# Rank features by their correlation with the prediction target, SalePrice
# (the question is house price prediction; the scatter plots that follow also
# use SalePrice on the y-axis).
cor["SalePrice"].sort_values(ascending=False)


In [None]:
train.plot(kind="scatter", x="OverallQual", y="SalePrice", alpha=0.5)


In [None]:
train.plot(kind="scatter", x="GrLivArea", y="SalePrice", alpha=0.5)


In [None]:
train.plot(kind="scatter", x="GarageCars", y="SalePrice", alpha=0.5)


In [None]:
train.boxplot(column=['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea'])


In [None]:
train.boxplot(column=['TotalBsmtSF', '1stFlrSF', 'FullBath', 'SalePrice'])


In [None]:
import numpy as np
train.fillna(-1000, inplace=True)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import LabelEncoder
# Columns to label-encode. 'LotShape' used to be listed twice, which made the
# encoding loops process it a second time for no benefit; it now appears once.
columns = ('GarageCond',
           'LandContour', 
           'RoofStyle', 
           'RoofMatl', 
           'Heating', 
           'MiscFeature', 
           'SaleType', 
           'GarageType',
           'Electrical', 
           'SaleCondition', 
           'Foundation',
           'Exterior1st', 
           'Exterior2nd',
           'MasVnrType', 
           'FireplaceQu', 
           'LotShape', 
           'Neighborhood', 
           'Condition1', 
           'Condition2', 
           'Utilities', 
           'BldgType', 
           'HouseStyle',
           'PoolQC', 
           'BsmtQual', 
           'BsmtCond', 
           'GarageQual',
           'BsmtExposure', 
           'ExterQual', 
           'ExterCond',
           'HeatingQC', 
           'KitchenQual', 
           'BsmtFinType1',
           'BsmtFinType2', 
           'Functional', 
           'Fence', 
           'GarageFinish', 
           'LandSlope',
           'PavedDrive', 
           'Street',
           'Alley', 
           'CentralAir', 
           'MSSubClass', 
           'OverallCond', 
           'YrSold',
           'MoSold', 
           'MSZoning',
           'LotConfig')


In [None]:
for i in columns:
    encoder = LabelEncoder()
    encoder.fit(list(train[i].values))
    train[i] = encoder.transform(list(train[i].values))

In [None]:
for j in columns:
    encoder_test = LabelEncoder()
    encoder_test.fit(list(test[j].values))
    test[j] = encoder_test.transform(list(test[j].values))

In [None]:
# The task is to predict the house price, so the target is SalePrice. The
# previous code used YrSold as the target while YrSold was also among the
# features. 'Id' is a row identifier, not one of the 79 explanatory variables,
# so it is excluded from the features as well.
feature_columns = [c for c in train.columns if c not in ('Id', 'SalePrice')]
x_train = train[feature_columns]
y_train = train['SalePrice']
# Select the same columns, in the same order, from the test frame.
x_validation = test[feature_columns]

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [None]:

# Replace +/-inf with NaN in `train`, then fill every NaN with -1000.
# NOTE(review): this cleanup runs after the regressor has already been fitted
# in the previous cell and modifies `train`, not the `x_train` frame built
# earlier, so it presumably has no effect on the fitted model — confirm
# whether it should run before the features are selected.
import numpy as np
train.replace([np.inf, -np.inf], np.nan, inplace=True)
train.fillna(-1000, inplace=True)

In [None]:
# Predict on the held-out test features instead of re-predicting the training
# rows. The test frame never had its missing numeric values filled, so apply
# the same inf -> NaN -> -1000 treatment used for `train`; LinearRegression
# rejects NaN inputs.
x_validation_filled = x_validation.replace([np.inf, -np.inf], np.nan).fillna(-1000)
predicted = regressor.predict(x_validation_filled)

In [None]:
predictedValues = pd.DataFrame({"Predicted Values": predicted})

In [None]:
predictedValues