# TF-IDF and Classification

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np

## 1. Data Import

In [None]:
# Widen the column display so long tweets are not truncated in the preview below.
pd.options.display.max_colwidth = 500

# The CSV is read with an explicit Latin-1 encoding.
df = pd.read_csv("Corona_NLP.csv", encoding='latin-1')
df.head(10)

In [None]:
# Keep only the tweet text and its sentiment, limited to the first 5000 rows.
# .copy() makes the result an independent frame, so adding columns to it later
# does not write into a slice of the loaded data (avoids SettingWithCopyWarning).
df = df[['OriginalTweet', 'Sentiment']].head(5000).copy()

In [None]:
set(df['Sentiment'].values)

In [None]:
# Binary target: 'Extremely Positive', 'Positive' and 'Neutral' map to 1,
# every other sentiment value maps to 0.
def _to_binary_label(sentiment):
    """Return 1 for the sentiments grouped as positive, 0 otherwise."""
    if sentiment in ['Extremely Positive', 'Positive', 'Neutral']:
        return 1
    return 0

df['LabelSentiment'] = df['Sentiment'].apply(_to_binary_label)

In [None]:
df.head()

In [None]:
df['LabelSentiment'].hist()

In [None]:
# Token counts over the raw (not yet cleaned) tweets, splitting on whitespace only.
corpus_text = ' '.join(df['OriginalTweet'])
all_words = corpus_text.split()
freq = pd.Series(all_words).value_counts()

# Ten most frequent tokens.
freq.sort_values(ascending=False)[:10]

## 2. Train - Test split

In [None]:
# Hold out 20% of the tweets for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df['OriginalTweet'],
    df['LabelSentiment'],
    test_size=0.20,
    random_state=4,
)

In [None]:
len(X_train), len(X_test), len(y_train), len(y_test)

## 3. Clean Text

In [None]:
# Download the NLTK resources used below (stop-word list and WordNet).
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Shared text-processing helpers used by the cleaning step.
lemma = WordNetLemmatizer()
stemmer = SnowballStemmer('english')
stop = set(stopwords.words('english'))

In [None]:
from tqdm import tqdm
def clean(text_list):
    """Normalise every text in ``text_list`` and return the results as a list.

    Each text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, tokens found in the module-level ``stop`` set are
    dropped, and the remaining tokens are lemmatized with the module-level
    ``lemma`` object. A tqdm progress bar tracks the iteration.

    Parameters
    ----------
    text_list : iterable of str
        The raw texts to clean.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input text, in the same order.
    """

    def _normalise(raw_text):
        # Lowercase first, then blank out digits, punctuation and any other non-letter.
        letters_only = re.sub("[^a-zA-Z]", " ", raw_text.lower())
        # Drop stop words, then lemmatize whatever is left.
        kept_tokens = (token for token in letters_only.split() if token not in stop)
        return " ".join(lemma.lemmatize(token) for token in kept_tokens)

    return [_normalise(entry) for entry in tqdm(text_list)]

In [None]:
X_train_clean = clean(X_train)

In [None]:
X_test_clean = clean(X_test)

## 4. Feature Engineering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
TfidfVectorizer?

In [None]:
tfidf = TfidfVectorizer(max_df=0.8, ngram_range=(1,1))

### Train

In [None]:
X = tfidf.fit_transform(X_train_clean).toarray()

In [None]:
tfidf.get_feature_names_out()

In [None]:
#Get a dictionary that maps each word (token) to its index (position) in the vectorized feature space.
tfidf.vocabulary_

In [None]:
# Pair every feature (term) with its IDF (Inverse Document Frequency) weight;
# tfidf.idf_ is aligned with the order of get_feature_names_out().
feature_names = tfidf.get_feature_names_out()
idf_dict = dict(zip(feature_names, tfidf.idf_))

# Preview ten consecutive entries taken from positions 4100-4109 of the
# feature list (not the first ten); the result is empty if there are fewer features.
idf_dict_10 = dict(zip(feature_names[4100:4110], tfidf.idf_[4100:4110]))

print(idf_dict_10)

In [None]:
X.shape

In [None]:
X

In [None]:
y = y_train.to_numpy()

In [None]:
y

### Test

In [None]:
x_test = tfidf.transform(X_test_clean).toarray()

In [None]:
# Convert the test labels to a NumPy array. The isinstance guard keeps this cell
# idempotent: on a second run y_test is already an ndarray, which has no
# .to_numpy() method and would otherwise raise AttributeError.
if isinstance(y_test, pd.Series):
    y_test = y_test.to_numpy()

## 5. Train Models and Evaluate

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
nb = GaussianNB()

In [None]:
nb.fit(X, y)

#### Predict

In [None]:
nb_pred = nb.predict(x_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, nb_pred))

In [None]:
from sklearn.metrics import confusion_matrix
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. Passing the predictions first transposes the matrix.
confusion_matrix(y_test, nb_pred)

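When called as `confusion_matrix(y_true, y_pred)`, sklearn returns a matrix whose rows are the actual classes and whose columns are the predicted classes. For binary labels 0/1 it has the following format: <br>
    [ [ TN  FP  ] <br>
    [ FN  TP ] ]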

#### Attributes

* Check the prior probability of each class.

In [None]:
nb.class_prior_

* Verify the number of training samples observed in each class.

In [None]:
nb.class_count_

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
LogisticRegression?

In [None]:
lr = LogisticRegression(random_state=0)

In [None]:
lr.fit(X,y)

In [None]:
lr.classes_

In [None]:
lr.coef_

In [None]:
lr.intercept_

In [None]:
lr_pred = lr.predict(x_test)

In [None]:
lr_pred

In [None]:
lr.predict_proba(x_test)

In [None]:
print(classification_report(y_test, lr_pred))

In [None]:
# True labels go first, predictions second, so rows = actual and columns = predicted.
confusion_matrix(y_test, lr_pred)

## Multilayer Perceptron - ANN

__The number of hidden layers__<br>
-	Increasing the number of hidden layers might or might not improve the accuracy, it depends on the complexity of the problem
-	Increasing the number of hidden layers more than needed will cause overfit on the training set and a decrease in the accuracy value for the test set

__The number of hidden units__ <br>
-	Using too few neurons in the hidden layers will result in underfitting
-	Using too many neurons in the hidden layer may result in overfitting and increases the training time of the neural network

The aim is to keep a good trade-off between the simplicity of the model and the performance accuracy! <br>

__Different rules of thumb exist (take them with a grain of salt):__
-	_The number of hidden neurons should be between the size of the input layer and the size of the output layer_
-	_The number of hidden neurons should be 2/3 the size of the input layer, plus the size of the output layer_
-	_The number of hidden neurons should be less than twice the size of the input layer_



In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
MLPClassifier?

In [None]:
mlp = MLPClassifier(solver='adam', hidden_layer_sizes=(2,2), activation='logistic', random_state=1)

In [None]:
mlp.fit(X,y)

In [None]:
mlp_pred = mlp.predict(x_test)

In [None]:
print(classification_report(y_test, mlp_pred))

In [None]:
# True labels go first, predictions second, so rows = actual and columns = predicted.
confusion_matrix(y_test, mlp_pred)

### Attributes

### Activation Function

In [None]:
model_relu = MLPClassifier(activation = 'relu')

- __Advantages:__
     - Computationally efficient - allows the network to converge very quickly.
- __Disadvantages:__
     - The dying ReLU problem - when a neuron's inputs are negative, the gradient of the function is zero, so backpropagation cannot update that neuron's weights and it stops learning.

In [None]:
model_logistic = MLPClassifier(activation = 'logistic')

 - __Advantages:__
     - Smooth gradient, preventing “jumps” in output values.
     - Output values bound between 0 and 1, normalizing the output of each neuron.
 - __Disadvantages:__
     - Vanishing gradient—for very high or very low values of X, there is almost no change to the prediction, causing a vanishing gradient problem. This can result in the network refusing to learn further, or have slow convergence.
     - Computationally expensive.

In [None]:
model_tanh = MLPClassifier(activation = 'tanh')

 - __Advantages:__
     - Zero centered - making it easier to model inputs that have strongly negative, neutral and strongly positive values. Other than that it is similar to the sigmoid function. <br>
 - __Disadvantages:__
     - Same as with the sigmoid function

### Solver

In [None]:
model_sgd = MLPClassifier(solver = 'sgd')

__Notes__
- While Gradient Descent uses the whole training set to perform a single update, SGD updates the parameters using a randomly chosen data point (or a small mini-batch) of the training data, so each update is much cheaper and SGD is typically faster than GD.
- It uses a common learning rate for all parameters, contrary to what happens in Adam.

In [None]:
model_adam = MLPClassifier(solver = 'adam')

__When to use__ <br>
- It achieves good results fast - good for complex models, if processing time is an issue.

__Notes__ <br>
- It computes individual adaptive learning rates for different parameters
- Adam combines the advantages of RMSProp and AdaGrad <br>

### Other Parameters

|Parameter| Definition | LBFGS | SGD | ADAM |
|---|---|---|---|---|
|alpha| L2 penalty (regularization term) parameter | yes | yes | yes |
| power_t | The exponent for inverse scaling learning rate. It is used in updating effective learning rate when the learning_rate is set to ‘invscaling’. | no | yes | no |
| shuffle | Whether to shuffle samples in each iteration. | no | yes | yes |
| tol | Tolerance for the optimization. When the loss or score is not improving by at least tol for n_iter_no_change consecutive iterations, unless learning_rate is set to ‘adaptive’, convergence is considered to be reached and training stops. | yes | yes | yes |
| warm_start | When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. | yes | yes | yes |
| momentum | Momentum for gradient descent update. Should be between 0 and 1. | no | yes | no |
| nesterovs_momentum | Whether to use Nesterov’s momentum.| no | yes | no |
| early_stopping | Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs. The split is stratified, except in a multilabel setting.  | no | yes | yes |
| validation_fraction | The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True | no | yes | yes|
| beta1 | Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1). | no | no | yes |
| beta2 | Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1).  | no | no | yes |
| epsilon | Value for numerical stability in adam. | no | no | yes |
| n_iter_no_change | Maximum number of epochs to not meet tol improvement. |  no | yes | yes |
| max_fun | Only used when solver=’lbfgs’. Maximum number of loss function calls. The solver iterates until convergence (determined by ‘tol’), number of iterations reaches max_iter, or this number of loss function calls. | yes | no | no |

### Grid Search

In [None]:

from sklearn.model_selection import GridSearchCV

In [None]:
parameter_space = {
    'hidden_layer_sizes': [(3,4,5), (10,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
# Base estimator for the grid search. A fixed random_state makes the weight
# initialisation (and therefore the search results) reproducible between runs.
model = MLPClassifier(random_state=1)

In [None]:
clf = GridSearchCV(model, parameter_space)

In [None]:
clf.fit(X, y)

In [None]:
clf.best_params_

In [None]:
# GridSearchCV was built with its default refit behaviour, so best_estimator_ is
# already retrained on the full training set with the best parameters; fitting it
# again would only repeat that work.
final_model = clf.best_estimator_
print('Train:', final_model.score(X, y))
# Score on x_test (the TF-IDF matrix). X_test holds the raw tweet text, which the
# model cannot consume.
print('Test:', final_model.score(x_test, y_test))

In [None]:
# Winning configuration, framed by separator lines.
separator = '------------------------------------------------------------------------------------------------------------------------'
print(separator)
print('Best parameters found:\n', clf.best_params_)
print(separator)

# Mean and standard deviation of the cross-validated test score for every
# parameter combination that was evaluated.
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print(f"{mean:0.3f} (+/-{std:0.03f}) for {params!r}")