<a href="https://colab.research.google.com/github/Wnjoki/Data-Science-Exercises/blob/main/Word2Vectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word2Vector


In [2]:
from traitlets.traitlets import Integer
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Reading the data
url = "https://raw.githubusercontent.com/Wnjoki/Data-Science-Exercises/main/Reviews.csv" 
ReviewData=pd.read_csv(url)

# Printing number of rows and columns
print(ReviewData.shape)

# Printing sample rows

ReviewData.head()


(21, 2)


Unnamed: 0,Remarks,Rated
0,"We came there after long trip, our room was no...",5
1,Overall the stay was absolutely fantastic. The...,5
2,The reception needs to have better communicati...,2
3,My family and I had an excellent vacation. The...,4
4,Sarova Lion Hill is a great place to be especi...,5


In [5]:
#Count Vectorization: converting text data to numeric
# Count vectorization of text
from sklearn.feature_extraction.text import CountVectorizer

# Review Data
corpus = ReviewData[' Remarks'].values

# Creating the vectorizer
vectorizer = CountVectorizer(stop_words='english')

# Converting the text to numeric data
X = vectorizer.fit_transform(corpus)


# Preparing Data frame For machine learning
# Priority column acts as a target variable and other columns as predictors
CountVectorizedData=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())

CountVectorizedData['Priority']=ReviewData['Rated']
print(CountVectorizedData.shape)
CountVectorizedData.head()

(21, 422)


Unnamed: 0,15m,20,2004,absolutely,accommodative,activities,agreed,amazing,amenities,angela,...,wet,willing,withdrew,wonderful,world,year,years,york,yummy,Priority
0,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,5
1,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,2,0,0,...,0,0,0,1,0,0,0,0,1,4
4,0,0,0,0,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,0,5


In [8]:
import gensim.downloader as api
path = api.load("word2vec-google-news-300", return_path=True)
print(path)

/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [6]:
import gensim

In [9]:
GoogleModel = gensim.models.KeyedVectors.load_word2vec_format('/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz',binary=True)

In [10]:
# Creating the list of words which are present in the Document term matrix
WordsVocab=CountVectorizedData.columns[:-1]

# Printing sample words
WordsVocab[0:10]

Index(['15m', '20', '2004', 'absolutely', 'accommodative', 'activities',
       'agreed', 'amazing', 'amenities', 'angela'],
      dtype='object')

In [None]:
#Converting every sentence to a numeric vector

In [13]:
# Defining a function which takes text input and returns one vector for each sentence
def FunctionText2Vec(inpTextData):
    # Converting the text to numeric data
    X = vectorizer.transform(inpTextData)
    CountVecData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    
    # Creating empty dataframe to hold sentences
    W2Vectorized_Data=pd.DataFrame()
    
    # Looping through each row for the data
    for i in range(CountVecData.shape[0]):

        # initiating a sentence with all zeros
        Sentence = np.zeros(300)

        # Looping thru each word in the sentence and if its present in 
        # the Word2Vec model then storing its vector
        for word in WordsVocab[CountVecData.iloc[i , :]>=1]:
            #print(word)
            if word in GoogleModel.key_to_index.keys():    
                Sentence=Sentence+GoogleModel[word]
        # Appending the sentence to the dataframe
        W2Vectorized_Data=W2Vectorized_Data.append(pd.DataFrame([Sentence]))
    return(W2Vectorized_Data)




In [20]:
# Calling the function to convert all the text data to Word2Vec Vectors
W2Vectorized_Data=FunctionText2Vec(ReviewData[' Remarks'])

# Checking the new representation for sentences
W2Vectorized_Data.shape


(21, 300)

In [21]:
# Comparing the above with the document term matrix
CountVectorizedData.shape

(21, 422)

In [22]:
# Adding the target variable
W2Vectorized_Data.reset_index(inplace=True, drop=True)
W2Vectorized_Data['Priority']=CountVectorizedData['Priority']
 
# Assigning to DataForML variable
DataForML=W2Vectorized_Data
DataForML.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Priority
0,0.529877,3.239471,-0.37915,1.614044,-0.756713,-0.675507,2.520691,-4.141632,1.584503,1.712891,...,1.087219,-1.62439,1.080994,-0.737869,-1.442684,0.537651,-1.323044,0.231056,-0.334259,5
1,-0.600555,-0.452728,-0.437073,1.315857,-0.386292,0.640381,0.426559,-2.39502,0.903564,1.460815,...,1.06073,-1.166321,1.315598,-0.098389,-1.314862,0.169189,-0.893814,0.661682,0.056274,5
2,0.595642,1.098122,0.435883,-0.044922,-1.729797,-0.273499,1.323853,-0.905151,1.119141,1.315407,...,1.584747,-1.253296,0.772301,-0.874268,-0.866882,-0.790558,-2.202759,0.846622,0.32373,2
3,-0.787994,1.716743,-0.49469,3.949081,-0.417206,0.738953,1.802982,-3.606323,-0.136078,1.728027,...,0.650673,-5.128418,1.229686,-0.146019,-0.733063,1.529938,-1.943909,1.387665,0.071991,4
4,-0.74942,3.461525,0.424088,5.560516,-1.320006,-1.427124,2.296112,-2.501915,-0.003929,3.802017,...,3.223572,-5.642334,-0.626343,-0.174942,-1.130066,2.113075,-1.745575,0.23111,-0.586029,5


In [23]:
# Separate Target Variable and Predictor Variables
TargetVariable=DataForML.columns[-1]
Predictors=DataForML.columns[:-1]

X=DataForML[Predictors].values
y=DataForML[TargetVariable].values

# Split the data into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=428)

# Sanity check for the sampled data
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(14, 300)
(14,)
(7, 300)
(7,)


In [24]:
#Standardization/Normalization

from sklearn.preprocessing import StandardScaler, MinMaxScaler
PredictorScaler=MinMaxScaler()

# fit the data
PredictorScalerFit=PredictorScaler.fit(X)

# Generating the standardized values of X
X=PredictorScalerFit.transform(X)

# Split the data into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Sanity check for the sampled data
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(14, 300)
(14,)
(7, 300)
(7,)


# **Training ML classification models**

In [25]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# GaussianNB is used in Binomial Classification
# MultinomialNB is used in multi-class classification
#clf = GaussianNB()
clf = MultinomialNB()

# Printing all the parameters of Naive Bayes
print(clf)

NB=clf.fit(X_train,y_train)
prediction=NB.predict(X_test)

# Measuring accuracy on Testing Data
from sklearn import metrics
print(metrics.classification_report(y_test, prediction))
print(metrics.confusion_matrix(y_test, prediction))

# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))

# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score

# Running 10-Fold Cross validation on a given algorithm
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(NB, X , y, cv=5, scoring='f1_weighted')
print('\nAccuracy values for 5-fold Cross Validation:\n',Accuracy_Values)
print('\nFinal Average Accuracy of the model:', round(Accuracy_Values.mean(),2))

MultinomialNB()
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       1.00      1.00      1.00         1
           3       0.00      0.00      0.00         1
           4       0.25      1.00      0.40         1
           5       0.00      0.00      0.00         4

    accuracy                           0.29         7
   macro avg       0.25      0.40      0.28         7
weighted avg       0.18      0.29      0.20         7

[[0 0 0 0 0]
 [0 1 0 0 0]
 [0 0 0 0 1]
 [0 0 0 1 0]
 [1 0 0 3 0]]
Accuracy of the model on Testing Sample Data: 0.2

Accuracy values for 5-fold Cross Validation:
 [0.16       0.2        0.33333333 0.83333333 0.25      ]

Final Average Accuracy of the model: 0.36


In [28]:
# K-Nearest Neighbor(KNN)
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=14)

# Printing all the parameters of KNN
print(clf)

# Creating the model on Training Data
KNN=clf.fit(X_train,y_train)
prediction=KNN.predict(X_test)

# Measuring accuracy on Testing Data
from sklearn import metrics
print(metrics.classification_report(y_test, prediction))
print(metrics.confusion_matrix(y_test, prediction))

# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))

# Importing cross validation function from sklearn
from sklearn.model_selection import cross_val_score

Accuracy_Values=cross_val_score(KNN, X , y, cv=3, scoring='f1_weighted')
print('\nAccuracy values for  Cross Validation:\n',Accuracy_Values)
print('\nFinal Average Accuracy of the model:', round(Accuracy_Values.mean(),2))


KNeighborsClassifier(n_neighbors=14)
              precision    recall  f1-score   support

           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.57      1.00      0.73         4

    accuracy                           0.57         7
   macro avg       0.14      0.25      0.18         7
weighted avg       0.33      0.57      0.42         7

[[0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 4]]
Accuracy of the model on Testing Sample Data: 0.42

Accuracy values for  Cross Validation:
 [0.25714286 0.25714286 0.25714286]

Final Average Accuracy of the model: 0.26


In [29]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# choose different values for solver 
clf = LogisticRegression(C=1,penalty='l2', solver='newton-cg')


# Creating the model on Training Data
LOG=clf.fit(X_train,y_train)

# Generating predictions on testing data
prediction=LOG.predict(X_test)
# Printing sample values of prediction in Testing data
TestingData=pd.DataFrame(data=X_test, columns=Predictors)
TestingData['Survived']=y_test
TestingData['Predicted_Survived']=prediction
print(TestingData.head())


# Measuring accuracy on Testing Data
from sklearn import metrics
print(metrics.classification_report(y_test, prediction))
print(metrics.confusion_matrix(prediction, y_test))

## Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))



          0         1         2         3         4         5         6  \
0  0.647735  0.790893  0.588076  0.237424  0.863077  0.334057  0.479814   
1  0.412831  0.535872  0.500265  0.110248  0.755636  0.601177  0.318230   
2  1.000000  0.575645  0.000000  0.871228  0.838195  0.493849  0.214496   
3  0.122428  0.000000  0.569096  0.194748  0.913303  0.918904  0.151738   
4  0.417216  0.262783  1.000000  0.105939  0.948102  0.184030  0.083859   

          7         8         9  ...       292       293       294       295  \
0  0.202339  0.228411  0.316487  ...  0.725738  0.348319  0.393356  0.319104   
1  0.660727  0.477545  0.544820  ...  0.661021  0.655372  0.000000  0.674567   
2  0.107613  0.535932  1.000000  ...  0.436415  0.336496  0.262473  0.000000   
3  0.500466  0.143486  0.256369  ...  0.786965  0.396181  0.570732  0.353399   
4  0.857138  0.146059  0.077440  ...  0.855616  0.154853  0.704155  0.790191   

        296       297       298       299  Survived  Predicted_Survi

In [31]:
# Decision Trees
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=10,criterion='gini')

# Printing all the parameters of Decision Trees
print(clf)

# Creating the model on Training Data
DTree=clf.fit(X_train,y_train)
prediction=DTree.predict(X_test)

# Measuring accuracy on Testing Data
from sklearn import metrics
print(metrics.classification_report(y_test, prediction))
print(metrics.confusion_matrix(y_test, prediction))

# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))


# cross validation
from sklearn.model_selection import cross_val_score
 
# Running 10-Fold Cross validation on a given algorithm

Accuracy_Values=cross_val_score(DTree, X , y, cv=3, scoring='f1_weighted')
print('\nAccuracy values for Cross Validation:\n',Accuracy_Values)
print('\nFinal Average Accuracy of the model:', round(Accuracy_Values.mean(),2))




DecisionTreeClassifier(max_depth=10)
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       1.00      0.25      0.40         4

    accuracy                           0.14         7
   macro avg       0.20      0.05      0.08         7
weighted avg       0.57      0.14      0.23         7

[[0 0 0 0 0]
 [0 0 0 1 0]
 [0 1 0 0 0]
 [1 0 0 0 0]
 [1 0 0 2 1]]
Accuracy of the model on Testing Sample Data: 0.23

Accuracy values for Cross Validation:
 [0.4        0.28571429 0.12857143]

Final Average Accuracy of the model: 0.27


In [32]:
# Adaboost 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Choosing Decision Tree with 1 level as the weak learner
DTC=DecisionTreeClassifier(max_depth=2)
clf = AdaBoostClassifier(n_estimators=10, base_estimator=DTC ,learning_rate=0.01)

# Printing all the parameters of Adaboost
print(clf)

# Creating the model on Training Data
AB=clf.fit(X_train,y_train)
prediction=AB.predict(X_test)

# Measuring accuracy on Testing Data
from sklearn import metrics
print(metrics.classification_report(y_test, prediction))
print(metrics.confusion_matrix(y_test, prediction))

# Printing the Overall Accuracy of the model
F1_Score=metrics.f1_score(y_test, prediction, average='weighted')
print('Accuracy of the model on Testing Sample Data:', round(F1_Score,2))

#  cross validation
from sklearn.model_selection import cross_val_score
 
# Running 10-Fold Cross validation on a given algorithm

Accuracy_Values=cross_val_score(AB, X , y, cv=3, scoring='f1_weighted')
print('\nAccuracy values for  Cross Validation:\n',Accuracy_Values)
print('\nFinal Average Accuracy of the model:', round(Accuracy_Values.mean(),2))
 
 

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.01, n_estimators=10)
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       0.33      1.00      0.50         1
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       1.00      0.25      0.40         4

    accuracy                           0.29         7
   macro avg       0.27      0.25      0.18         7
weighted avg       0.62      0.29      0.30         7

[[0 0 0 0 0]
 [0 1 0 0 0]
 [0 1 0 0 0]
 [0 1 0 0 0]
 [1 0 0 2 1]]
Accuracy of the model on Testing Sample Data: 0.3

Accuracy values for  Cross Validation:
 [0.31428571 0.24489796 0.2952381 ]

Final Average Accuracy of the model: 0.28
