In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Installing Libraries

In [1]:
# Download packages
!pip install plotly
!pip install stopwordsiso
!pip install yellowbrick
!pip install imblearn

Collecting plotly
  Downloading plotly-5.3.1-py2.py3-none-any.whl (23.9 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.3.1 tenacity-8.0.1
Collecting stopwordsiso
  Downloading stopwordsiso-0.6.1-py3-none-any.whl (73 kB)
Installing collected packages: stopwordsiso
Successfully installed stopwordsiso-0.6.1
Collecting yellowbrick
  Downloading yellowbrick-1.3.post1-py3-none-any.whl (271 kB)
Installing collected packages: yellowbrick
Successfully installed yellowbrick-1.3.post1
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.1-py3-none-any.whl (189 kB)
Collecting scikit-learn>=0.24
  Downloading scikit_learn-1.0-cp38-cp38-win_amd64.whl (7.2 MB)
Installing collected packages: scikit-learn, imbalanced-learn, imblearn
  Attempting uninstall: scikit-learn
    Found existing installation:

In [4]:
!pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.8.1-cp38-cp38-win_amd64.whl (155 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.1


In [5]:
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from yellowbrick.text import FreqDistVisualizer
from yellowbrick.features import RadViz
from wordcloud import WordCloud
import plotly.io as pio
%matplotlib inline

#noise libraries
import re
import string 
from stopwordsiso import stopwords as sw
from nltk.corpus import stopwords

# Text Preprocessing
from nltk.tokenize import TweetTokenizer
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Feature Engineering and Data preparation for modelling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Model building and training
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

#Model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

#save the final model and vectorizer
import pickle

## 2. Loading the Datasets

In [29]:
# Load the training dataset
train = pd.read_csv('../input/edsa-climate-change-belief-analysis-2021/train.csv')
# Preview the first rows (head() is called without an argument, so pandas' default row count applies)
train.head()

In [30]:
# Load the test dataset
test = pd.read_csv('../input/edsa-climate-change-belief-analysis-2021/test.csv')
# Preview the first rows (head() is called without an argument, so pandas' default row count applies)
test.head()


##  3. Text Preprocessing - cleaning 

### 3.1. Noise Removal

In [31]:
#dataframe cleaning function
def text_preprocessing(dataframe):
    """Return a copy of ``dataframe`` whose ``message`` column has been cleaned of noise.

    The cleaning steps run in this order:

    1. lowercase the text
    2. remove URLs
    3. remove twitter handles (``@name``)
    4. remove punctuation
    5. remove non-ASCII characters
    6. remove digits

    URLs and handles are removed *before* punctuation on purpose: their patterns
    rely on ``://``, ``.`` and ``@``, so stripping punctuation first would leave
    them undetectable and their remains would stay in the text.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string column named ``message``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    numercial_digits = r'\d+'
    twitter_handles = r'@(\w+)'
    url_address = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
    non_ascii = r'[^\x00-\x7F]+'

    # Translation table that deletes every character in string.punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Work on a copy so the caller's dataframe is left untouched
    dataframe = dataframe.copy()

    message = dataframe['message'].str.lower()

    # Structured tokens first, while their punctuation is still present
    message = message.str.replace(url_address, '', regex=True)
    message = message.str.replace(twitter_handles, '', regex=True)

    # Character-level noise afterwards
    message = message.str.translate(punctuation_table)
    message = message.str.replace(non_ascii, '', regex=True)
    message = message.str.replace(numercial_digits, '', regex=True)

    dataframe['message'] = message

    return dataframe

In [32]:
# Clean the training dataframe with text_preprocessing; the result is stored separately as cleaned_train
cleaned_train = text_preprocessing(train)

# Show the first 8 rows of the cleaned training dataframe, allowing up to 200 characters per column
with pd.option_context('display.max_colwidth', 200):
    display(cleaned_train.head(8))

### 3.2. Stopwords removal and tokenisation(tzr)

In [33]:
#function that tokenizes words on df
def tokens(dataframe, column_name):
    """Tokenize the text in ``column_name`` and return the result as a new dataframe.

    Each value of the column is replaced by the list of tokens produced by
    ``TweetTokenizer(reduce_len = True)``. The input dataframe is not modified.
    """
    # reduce_len shortens runs of repeated characters in a word
    tweet_tokenizer = TweetTokenizer(reduce_len = True)

    tokenized = dataframe.copy()
    tokenized[column_name] = tokenized[column_name].apply(tweet_tokenizer.tokenize)
    return tokenized

In [34]:
# Tokenize the message column of the cleaned training dataframe using the tokens function
token_train = tokens(cleaned_train, 'message')

# Show the first 8 rows of the tokenized training dataframe, allowing up to 200 characters per column
with pd.option_context('display.max_colwidth', 200):
    display(token_train.head(8))

In [35]:
#funct that removes stop words and "rt" from df 
def stop(dataframe, column_name):
    """Remove English stopwords and the retweet marker ``'rt'`` from a tokenized column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose ``column_name`` column holds lists of tokens.
    column_name : str
        Name of the tokenized column to filter.

    Returns
    -------
    pandas.DataFrame
        A new frame with the filtered token lists; the input frame is not modified.

    Notes
    -----
    ``stopwords`` is the NLTK corpus reader, so the word list is obtained with
    ``stopwords.words('english')`` (the reader itself is not callable). The NLTK
    stopwords corpus must be available, e.g. via ``nltk.download('stopwords')``.
    """
    # Work on a copy so the caller's dataframe is left untouched
    dataframe = dataframe.copy()

    # Build the exclusion set once: set membership is O(1) and the corpus is not re-read per tweet
    excluded = set(stopwords.words('english'))
    excluded.add('rt')

    dataframe[column_name] = dataframe[column_name].apply(
        lambda tweet_tokens: [token for token in tweet_tokens if token not in excluded]
    )

    return dataframe

In [36]:
# Call the stop function on the tokenized training dataframe, creating a new training dataframe without stopwords and rts
stops_train = stop(token_train, 'message')

# Display the first 10 rows of the training dataframe without stopwords and rts, allowing up to 400 characters per column
with pd.option_context('display.max_colwidth', 400):
    display(stops_train.head(10))

### 3.3. Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()

# Start from the stopword-free training dataframe (`stops_train`, produced by the stop function)
lemmatized_train = stops_train.copy()

# Lemmatize every token in the message column of the training dataframe
lemmatized_train['message'] = stops_train['message'].apply(
    lambda sentence: [lemmatizer.lemmatize(word) for word in sentence]
)


In [None]:
# Show the first 8 rows of the lemmatized training dataframe, allowing up to 200 characters per column
with pd.option_context('display.max_colwidth', 200):
    display(lemmatized_train.head(8))

In [None]:
# Run the test set through the same cleaning pipeline as the training set
# (noise removal -> tokenization -> stopword/rt removal) so that `stop_test` exists
stop_test = stop(tokens(text_preprocessing(test), 'message'), 'message')

# Create a new dataframe to be lemmatized (test)
lemmatized_test = stop_test.copy()

# Lemmatize every token in the message column of the test dataframe
lemmatized_test['message'] = stop_test['message'].apply(
    lambda sentence: [lemmatizer.lemmatize(word) for word in sentence]
)

# Show the first 8 rows of the lemmatized test dataframe, allowing up to 200 characters per column
with pd.option_context('display.max_colwidth', 200):
    display(lemmatized_test.head(8))

In [None]:
# Join the lemmatized tokens of each training tweet back into a single sentence.
# Values that are already strings are left alone, so re-running this cell does not
# split an already-joined sentence into space-separated characters.
lemmatized_train['message'] = lemmatized_train['message'].apply(
    lambda tweet: tweet if isinstance(tweet, str) else ' '.join(tweet)
)

# Show the first 10 rows of the training dataframe, allowing up to 400 characters per column
with pd.option_context('display.max_colwidth', 400):
    display(lemmatized_train.head(10))

In [None]:
# Join the lemmatized tokens of each test tweet back into a single sentence.
# Values that are already strings are left alone, so re-running this cell does not
# split an already-joined sentence into space-separated characters.
lemmatized_test['message'] = lemmatized_test['message'].apply(
    lambda tweet: tweet if isinstance(tweet, str) else ' '.join(tweet)
)

# Show the first 8 rows of the test dataframe, allowing up to 200 characters per column
with pd.option_context('display.max_colwidth', 200):
    display(lemmatized_test.head(8))

## 4. Exploratory data analysis

### 4.1 Predictor Analysis

In [None]:
#funct extracting hashtags from df
def hashtag_extract(message):
    """Collect the hashtags of every tweet in ``message``.

    Returns a list with one entry per tweet; each entry is the list of hashtag
    words (without the leading '#') found in that tweet, possibly empty.
    """
    return [re.findall(r"#(\w+)", tweet) for tweet in message]

In [None]:
# Extract the hashtags of the original (uncleaned) training tweets, one sentiment class at a time.
# Each result is a list with one list of hashtags per tweet.

# Positive class (sentiment == 1)
positive_hashtag = hashtag_extract(train.loc[train['sentiment'] == 1, 'message'])

# Negative class (sentiment == -1)
negative_hashtag = hashtag_extract(train.loc[train['sentiment'] == -1, 'message'])

# Neutral class (sentiment == 0)
neutral_hashtag = hashtag_extract(train.loc[train['sentiment'] == 0, 'message'])

# News class (sentiment == 2)
news_hashtag = hashtag_extract(train.loc[train['sentiment'] == 2, 'message'])

In [None]:
# Flatten the per-tweet hashtag lists (`*_hashtag`, one list per tweet) into a single
# list per class (`*_hashtags`). Reading from the singular names keeps this cell
# re-runnable, and a comprehension avoids the quadratic cost of sum(list_of_lists, []).

# Create a single list for the positive sentiment
positive_hashtags = [tag for tweet_tags in positive_hashtag for tag in tweet_tags]

# Create a single list for the negative sentiment
negative_hashtags = [tag for tweet_tags in negative_hashtag for tag in tweet_tags]

# Create a single list for the neutral sentiment
neutral_hashtags = [tag for tweet_tags in neutral_hashtag for tag in tweet_tags]

# Create a single list for the news sentiment
news_hashtags = [tag for tweet_tags in news_hashtag for tag in tweet_tags]

In [None]:
# Count how often each hashtag occurs in the positive hashtags list
freq = nltk.FreqDist(positive_hashtags)

# Turn the (hashtag, count) pairs into a two-column dataframe
df = pd.DataFrame(list(freq.items()), columns = ['Hashtags', 'Count'])

# Order by descending count and show the 10 most frequent hashtags
df_pos = df.sort_values(by = 'Count', ascending = False)
df_pos.head(10)

In [None]:
# Bar chart of the 10 most frequent hashtags in the positive class
fig = go.Figure(data = [go.Bar(
    x = df_pos['Hashtags'].head(10),
    y = df_pos['Count'].head(10).sort_values(ascending = False),
    marker = dict(color = df_pos['Count'], colorscale = 'viridis'),
)])

# Title and axis labels
fig.update_layout(
    title = 'Count of Hashtags for Positive Man-Made Climate Change Sentiment',
    xaxis_title = 'Hashtags',
    yaxis_title = 'Hashtag Counts',
)

# Show the bar plot
fig.show()

In [None]:
# Count how often each hashtag occurs in the negative hashtags list
freq = nltk.FreqDist(negative_hashtags)

# Turn the (hashtag, count) pairs into a two-column dataframe
df = pd.DataFrame(list(freq.items()), columns = ['Hashtags', 'Count'])

# Order by descending count and show the 10 most frequent hashtags
df_neg = df.sort_values(by = 'Count', ascending = False)
df_neg.head(10)

In [None]:
# Bar chart of the 10 most frequent hashtags in the negative class
fig = go.Figure(data = [go.Bar(
    x = df_neg['Hashtags'].head(10),
    y = df_neg['Count'].head(10).sort_values(ascending = False),
    marker = dict(color = df_neg['Count'], colorscale = 'viridis'),
)])

# Title and axis labels
fig.update_layout(
    title = 'Count of Hashtags for Negative Man-Made Climate Change Sentiment',
    xaxis_title = 'Hashtags',
    yaxis_title = 'Hashtag Counts',
)

# Show the bar plot
fig.show()

In [None]:
# Count how often each hashtag occurs in the neutral hashtags list
freq = nltk.FreqDist(neutral_hashtags)

# Turn the (hashtag, count) pairs into a two-column dataframe
df = pd.DataFrame(list(freq.items()), columns = ['Hashtags', 'Count'])

# Order by descending count and show the 10 most frequent hashtags
df_neu = df.sort_values(by = "Count", ascending = False)
df_neu.head(10)

In [None]:
# Bar chart of the 10 most frequent hashtags in the neutral class
fig = go.Figure(data = [go.Bar(
    x = df_neu['Hashtags'].head(10),
    y = df_neu['Count'].head(10).sort_values(ascending = False),
    marker = dict(color = df_neu['Count'], colorscale = 'viridis'),
)])

# Title and axis labels
fig.update_layout(
    title = 'Count of Hashtags for Neutral Man-Made Climate Change Sentiment',
    xaxis_title = 'Hashtags',
    yaxis_title = 'Hashtag Counts',
)

# Show the bar plot
fig.show()

In [None]:
# Count how often each hashtag occurs in the news hashtags list
freq = nltk.FreqDist(news_hashtags)

# Turn the (hashtag, count) pairs into a two-column dataframe
df = pd.DataFrame(list(freq.items()), columns = ['Hashtags', 'Count'])

# Order by descending count and show the 10 most frequent hashtags
df_news = df.sort_values(by = 'Count', ascending = False)
df_news.head(10)

In [None]:
# Bar chart of the 10 most frequent hashtags in the news class
fig = go.Figure(data = [go.Bar(
    x = df_news['Hashtags'].head(10),
    y = df_news['Count'].head(10),
    marker = dict(color = df_news['Count'], colorscale = 'viridis'),
)])

# Title and axis labels
fig.update_layout(
    title = 'Count of Hashtags for News Climate Change Sentiment',
    xaxis_title = 'Hashtags',
    yaxis_title = 'Hashtag Counts',
)

# Show the bar plot
fig.show()

In [None]:
# Use the clean, preprocessed training data to visualise the most frequently used non-stopword words for each sentiment

# Subject-matter words to exclude. Defined in this cell because `extractor` looks the
# name up when it is called below, so it must exist before the first .apply(extractor).
climate_list = ['climate', 'change', 'global', 'warming']

# Function that drops the subject-matter words from a sentence
extractor = lambda words:  " ".join([word for word in words.split() if word not in climate_list])

# Tweets written by individuals who were pro man-made climate change (sentiment == 1)
data_pos = lemmatized_train.loc[lemmatized_train['sentiment'] == 1, 'message'].apply(extractor)

# Tweets written by individuals who were anti man-made climate change (sentiment == -1)
data_neg = lemmatized_train.loc[lemmatized_train['sentiment'] == -1, 'message'].apply(extractor)

# Tweets written by individuals who were neutral on man-made climate change (sentiment == 0)
data_neutral = lemmatized_train.loc[lemmatized_train['sentiment'] == 0, 'message'].apply(extractor)

# Tweets related to news about climate change (sentiment == 2)
data_news = lemmatized_train.loc[lemmatized_train['sentiment'] == 2, 'message'].apply(extractor)

In [None]:
# Fit one CountVectorizer per sentiment class and keep its vocabulary.
# get_feature_names_out() replaces the deprecated get_feature_names(); .tolist()
# keeps the result a plain Python list, as before.

# Positive sentiment
cv_pos = CountVectorizer()
docs_pos = cv_pos.fit_transform(data_pos) # Fit cv_pos to the series containing tweets associated with the positive sentiment
features_pos = cv_pos.get_feature_names_out().tolist() # Words used in the positive sentiment

# Negative sentiment
cv_neg = CountVectorizer()
docs_neg = cv_neg.fit_transform(data_neg) # Fit cv_neg to the series containing tweets associated with the negative sentiment
features_neg = cv_neg.get_feature_names_out().tolist() # Words used in the negative sentiment

# Neutral sentiment
cv_neutral = CountVectorizer()
docs_neutral = cv_neutral.fit_transform(data_neutral) # Fit cv_neutral to the series containing tweets associated with the neutral sentiment
features_neutral = cv_neutral.get_feature_names_out().tolist() # Words used in the neutral sentiment

# News class
cv_news = CountVectorizer()
docs_news = cv_news.fit_transform(data_news) # Fit cv_news to the series containing tweets associated with the news class
features_news = cv_news.get_feature_names_out().tolist() # Words used in the news class

In [None]:
# Number of most-frequent tokens drawn in each chart; the titles are built from it so
# the title and the plotted bar count cannot drift apart
top_n = 5

# Figure size handed to each FreqDistVisualizer. Previously the size was given to
# separate, unused RadViz objects, so it never affected these charts.
plot_size = (1080, 720)

# Frequency distribution of the most frequent words associated with the positive sentiment
viz_pos = FreqDistVisualizer(features = features_pos, orient = 'v', n = top_n, color = 'green', size = plot_size,
                             title = f'Frequency Distribution of Top {top_n} tokens for Pro sentiment')
viz_pos.fit(docs_pos) # Fit the vectorized bag of words to the visualizer
viz_pos.show() # Display the graph of positive sentiment words

# Frequency distribution of the most frequent words associated with the negative sentiment
viz_neg = FreqDistVisualizer(features = features_neg, orient = 'v', n = top_n, color = 'red', size = plot_size,
                             title = f'Frequency Distribution of Top {top_n} tokens for Anti sentiment')
viz_neg.fit(docs_neg) # Fit the vectorized bag of words to the visualizer
viz_neg.show() # Display the graph of negative sentiment words

# Frequency distribution of the most frequent words associated with the neutral sentiment
viz_neutral = FreqDistVisualizer(features = features_neutral, orient = 'v', n = top_n, color = 'yellow', size = plot_size,
                                 title = f'Frequency Distribution of Top {top_n} tokens for Neutral sentiment')
viz_neutral.fit(docs_neutral) # Fit the vectorized bag of words to the visualizer
viz_neutral.show() # Display the graph of neutral sentiment words

# Frequency distribution of the most frequent words associated with the news class
viz_news = FreqDistVisualizer(features = features_news, orient = 'v', n = top_n, color = 'purple', size = plot_size,
                              title = f'Frequency Distribution of Top {top_n} tokens for News class')
viz_news.fit(docs_news) # Fit the vectorized bag of words to the visualizer
viz_news.show() # Display the graph of news sentiment words

### 4.2 Label Analysis

In [None]:
# Count the tweets in each sentiment class and order the classes from most to fewest tweets
group = (
    lemmatized_train
    .groupby('sentiment')['message']
    .count()
    .reset_index()
    .sort_values(by = 'message', ascending = False)
)

# The sentiment column holds the four classes and the message column the number of tweets per class
group

In [None]:
# Bar plot of the number of tweets per sentiment class.
# The bar labels are derived from the sentiment codes in `group`, so each count keeps
# its own label whatever order the classes are sorted in.
sentiment_names = {1: 'Positive', 2: 'News', 0: 'Neutral', -1: 'Negative'}

fig = go.Figure(go.Bar(x = group['sentiment'].map(sentiment_names), y = group['message'], # Specify x and y variables
                       marker = {'color': group['message'],'colorscale': 'plasma'})) # Select a colour for the graph

# Add title, x and y axis labels to the bar chart
fig.update_layout(yaxis_title = 'Tweets', xaxis_title = 'Sentiment', title = 'Number of Tweets Per Sentiment')

# Show the bar plot
fig.show()

In [None]:
# Funnel chart of the number of tweets per class as a proportion.
# The labels are derived from the sentiment codes in `group`, so each count keeps
# its own label whatever order the classes are sorted in.
sentiment_names = {1: 'Positive', 2: 'News', 0: 'Neutral', -1: 'Negative'}
sentiment_labels = group['sentiment'].map(sentiment_names).tolist()

fig = go.Figure(go.Funnelarea(text = sentiment_labels, values = group['message'],
                              marker = {'colors': group['message']}, # column to colour chart on
                              title = {"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}, # Chart Title
                              labels = sentiment_labels)) # Legend labels

# Show the funnel chart
fig.show()

In [None]:
# One word cloud per sentiment class, built from the lemmatized training tweets.

# Pro man-made climate change (sentiment == 1): concatenate the tweets and build the cloud
positive_words = " ".join(lemmatized_train.loc[lemmatized_train['sentiment'] == 1, 'message'])
positive_wordcloud = WordCloud(
    background_color = 'white', max_font_size = 100, random_state = 42, width = 800, height = 500,
).generate(positive_words)

# Draw the positive word cloud without axes
ax1 = plt.imshow(positive_wordcloud)
plt.axis('off')
plt.show()

# Anti man-made climate change (sentiment == -1)
negative_words = " ".join(lemmatized_train.loc[lemmatized_train['sentiment'] == -1, 'message'])
negative_wordcloud = WordCloud(
    background_color = 'white', max_font_size = 100, random_state = 42, width = 800, height = 500,
).generate(negative_words)

# Draw the negative word cloud without axes
ax2 = plt.imshow(negative_wordcloud)
plt.axis('off')
plt.show()

# Neutral on man-made climate change (sentiment == 0)
neutral_words = " ".join(lemmatized_train.loc[lemmatized_train['sentiment'] == 0, 'message'])
neutral_wordcloud = WordCloud(
    background_color = 'white', max_font_size = 100, random_state = 42, width = 800, height = 500,
).generate(neutral_words)

# Draw the neutral word cloud without axes
ax3 = plt.imshow(neutral_wordcloud)
plt.axis('off')
plt.show()

# News about climate change (sentiment == 2)
news_words = " ".join(lemmatized_train.loc[lemmatized_train['sentiment'] == 2, 'message'])
news_wordcloud = WordCloud(
    background_color = 'white', max_font_size = 100, random_state = 42, width = 800, height = 500,
).generate(news_words)

# Draw the news word cloud without axes
ax4 = plt.imshow(news_wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Subject-matter words to leave out of the second round of word clouds
climate_list = ['climate', 'change', 'global', 'warming']

# Pro sentiment: drop the subject-matter words, then build and draw the cloud
new_positive = " ".join(word for word in positive_words.split() if word not in climate_list)
pos_wc = WordCloud(
    background_color = 'white', max_font_size = 100, random_state = 42, width = 800, height = 500,
).generate(new_positive)

ax1 = plt.imshow(pos_wc)
plt.axis('off')
plt.show()

# Anti sentiment: drop the subject-matter words, then build and draw the cloud
new_negative = " ".join(word for word in negative_words.split() if word not in climate_list)
neg_wc = WordCloud(
    background_color = 'white', max_font_size = 100, random_state = 42, width = 800, height = 500,
).generate(new_negative)

ax2 = plt.imshow(neg_wc)
plt.axis('off')
plt.show()

# Neutral sentiment: drop the subject-matter words, then build and draw the cloud
new_neutral = " ".join(word for word in neutral_words.split() if word not in climate_list)
neu_wc = WordCloud(
    background_color = 'white', max_font_size = 100, random_state = 42, width = 800, height = 500,
).generate(new_neutral)

ax3 = plt.imshow(neu_wc)
plt.axis('off')
plt.show()

# News class: drop the subject-matter words, then build and draw the cloud
new_news = " ".join(word for word in news_words.split() if word not in climate_list)
new_wc = WordCloud(
    background_color = 'white', max_font_size = 100, random_state = 42, width = 800, height = 500,
).generate(new_news)

ax4 = plt.imshow(new_wc)
plt.axis('off')
plt.show()

## 5. Feature Extraction

In [None]:
# Create a function that cleans the training data and prepares it for modelling
def preprocessing(string):
    """Clean a single tweet so it can be vectorized for modelling.

    The text is lowercased, URLs are removed, twitter handles are stripped with
    ``TweetTokenizer(strip_handles = True)``, every character other than the
    letters a-z and whitespace is removed (punctuation and digits), and the
    retweet marker ``rt`` is removed wherever it appears as a whole word.

    Parameters
    ----------
    string : str
        A sentence string which is to go through text cleaning

    Returns
    -------
    str
        A string which has been cleaned of noise. Whitespace left behind by
        removed tokens is not collapsed.
    """
    # Change the casing in the inputted string to lowercase
    string = string.lower()

    # Remove url addresses from the string
    string = re.sub(r"http\S+", "", string)

    # Tokenize with handle stripping enabled, then join the tokens back into a sentence
    tknzr = TweetTokenizer(strip_handles = True)
    string = " ".join(tknzr.tokenize(string))

    # Remove punctuation and digits in one pass: keep only lowercase letters and whitespace
    string = re.sub(r'[^a-z\s]', '', string)

    # Remove the retweet marker as a whole word. Anchoring on '^rt' only handled a
    # leading marker and also chopped the first two letters off any first word that
    # merely starts with "rt".
    message = re.sub(r'\brt\b', '', string)

    # Return a new string which has been cleaned of noise
    return message

### 5.1. CountVectorizer

In [None]:
# Build train_data, the dataframe used for modelling: a copy of the training data
# whose message column has been cleaned of noise by the preprocessing function
train_data = train.assign(message = train['message'].apply(preprocessing))

In [None]:
# Label: the sentiment column of train_data
y = train_data['sentiment']

# Predictors: the cleaned message column of train_data
predictors = train_data['message']

# Shapes of the predictors (first line) and of the label (second line)
print(predictors.shape, y.shape, sep = '\n')

In [None]:
# Instantiate CountVectorizer as cv, counting both single words and two-word sequences (ngram_range 1 to 2)
cv = CountVectorizer(ngram_range =(1,2))

# Fit cv on the cleaned messages and transform them into the vectorized feature matrix X
X = cv.fit_transform(predictors)

# View the shape of the vectorized sparse matrix
print("The predictors have the shape:", X.shape)

### 5.2. Train Test Split

In [None]:
# Split X and y into a training set and a validation set: 20% of the rows go to validation,
# and random_state fixes the shuffle so the split is reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Report the shapes of the training and validation splits, one per line
print(
    f'The X training variable has the shape: {X_train.shape}',
    f'The y training variable has the shape: {y_train.shape}',
    f'The X validation variable has the shape: {X_val.shape}',
    f'The y validation variable has the shape: {y_val.shape}',
    sep = '\n',
)

## 7. Model Training

### 7.1. Logistic Regression Classifier

In [None]:
# Instantiate the Logistic Regression model as logreg: one-vs-rest multi-class handling,
# the liblinear solver, and a fixed random_state for reproducibility
logreg = LogisticRegression(multi_class = 'ovr', solver = 'liblinear', random_state = 42) 

In [None]:
# Fit the logistic regression model to the training split
logreg.fit(X_train, y_train)

In [None]:
# Predict the sentiment class of every tweet in the validation set
y_logreg = logreg.predict(X_val)

#### 7.1.1. Model Performance

Model Accuracy Metrics Used
- Confusion Matrix
- Classification Report
- F1_score 

In [None]:
# Labels shared by the confusion matrices and classification reports below.
# All three lists use the same class order: Anti, Neutral, Pro, News (codes -1, 0, 1, 2).
true_labels = ['true : Anti', 'true : Neutral', 'true : Pro', 'true : News']
pred_labels = ['pred : Anti', 'pred : Neutral', 'pred : Pro', 'pred : News']
type_labels = ['-1 : Anti', '0 : Neutral', '1 : Pro', '2 : News']

In [None]:
# Confusion matrix of the basic logistic regression model on the validation set, shown as a
# dataframe with the true classes as rows and the predicted classes as columns
pd.DataFrame(data = confusion_matrix(y_val, y_logreg), index = true_labels, columns = pred_labels)

In [None]:
# A classification report shows the precision, recall and f1 scores of the model's performance
# Create a classification report for the basic logistic regression model on the validation set
logreg_report = classification_report(y_val, y_logreg, target_names=type_labels)

# Print out the classification report 
print(logreg_report)

In [None]:
# Weighted F1 score of the basic logistic regression model on the validation set.
# The printed label names the metric that is actually computed (weighted F1, not accuracy).
f1_score_logreg = f1_score(y_val, y_logreg, average = "weighted")
print(f'This is the weighted F1 score for the basic LogisticRegression classifier: {f1_score_logreg}')

### 7.2. SMOTE Logistic Regression Classifier

In [None]:
# Instantiate SMOTE as sm
sm = SMOTE(random_state = 42) 

In [None]:
# Fit the SMOTE model on the training data
X_res, y_res = sm.fit_resample(X_train, y_train) 

In [None]:
# Instantiate a Logistic Regression model to use with the resampled data as logreg_smote 
logreg_smote = LogisticRegression(multi_class = 'ovr', solver = 'liblinear', random_state = 42)

In [None]:
# Fit the logistic regression model on the resampled data
logreg_smote.fit(X_res, y_res)

In [None]:
# Predict on the validation set
y_logreg_smote = logreg_smote.predict(X_val)

#### 7.2.1. Model Performance

In [None]:
#Confusion matrix
# Create a confusion matrix on the validation data
pd.DataFrame(data = confusion_matrix(y_val, y_logreg_smote), index = true_labels, columns = pred_labels)

In [None]:
#classification report
#classification report on the validation set
smote_report = classification_report(y_val, y_logreg_smote, target_names = type_labels)

# Print the classification report
print(smote_report)

In [None]:
# Weighted F1 score of the SMOTE-trained logistic regression model on the validation set
f1_score_logreg_smote = f1_score(y_val, y_logreg_smote, average = "weighted")

# Print the f1 score; the label identifies the SMOTE model and the metric actually computed
print(f'This is the weighted F1 score for the SMOTE LogisticRegression classifier: {f1_score_logreg_smote}')

In [None]:
# Side-by-side validation f1 scores: basic logistic regression vs the SMOTE-trained one

# Row labels, one per model
models = ["logreg(no smote)","logreg(with smote)"]

# Validation f1 scores, listed in the same order as the row labels
f1_scores = [f1_score_logreg, f1_score_logreg_smote]

# Name of the single metric column
metrics = ['f1_score']

# One row per model, one column holding the f1 score
model_df = pd.DataFrame({metrics[0]: f1_scores}, index = models)

# Show the comparison table (head() returns at most 5 rows; this table has 2)
model_df.head()

###  7.3. Support Vector Classifier

In [None]:
# Instantiate the support vector classifier model as svm_clf 
svm_clf = SVC(C = 10, gamma = 0.01)

In [None]:
# Fit the model on the training data
svm_clf.fit(X_train, y_train)

In [None]:
# Predict on the validation set
y_svm_CV = svm_clf.predict(X_val)


#### 7.3.1. Model Performance

In [None]:
# Create a dataframe to display the confusion matrix results
pd.DataFrame(data = confusion_matrix(y_val, y_svm_CV), index = true_labels, columns = pred_labels)

In [None]:
# Print the classification report the validation set
print(classification_report(y_val, y_svm_CV, target_names = type_labels))

In [None]:
# Calculate the weighted f1 score of the basic support vector classifier on the validation set
f1_score_svm = f1_score(y_val, y_svm_CV, average = "weighted")

# Print the f1 score (the closing parenthesis of print() was missing, which made this cell a SyntaxError)
print(f'This is the weighted F1 score for the basic Support Vector Machine classifier: {f1_score_svm}')

### 7.3. SMOTE Support Vector Classifier

In [None]:
# Instantiate a support vector classifier model to use on the resampled data
svm_clf_CV_smote = SVC(C = 10, gamma = 0.01)

# Fit the model on the resampled data
svm_clf_CV_smote.fit(X_res,y_res)

In [None]:
# Predict using the validation set
y_pred_svm_CV_smote = svm_clf_CV_smote.predict(X_val)

#### 7.3.1. Model Performance

In [None]:
# Create a dataframe to display the confusion matrix results
pd.DataFrame(data = confusion_matrix(y_val, y_pred_svm_CV_smote), index = true_labels, columns = pred_labels)

In [None]:
# Print the classification report the validation set
print(classification_report(y_val, y_pred_svm_CV_smote, target_names = type_labels))

In [None]:
# Calculate the weighted f1 score of the SMOTE-trained support vector classifier on the validation set
f1_svm_CV_smote = f1_score(y_val, y_pred_svm_CV_smote, average = "weighted")

# Print the score computed above (previously the basic SVM score, f1_score_svm, was printed here)
print(f'This is the weighted F1 score for the SMOTE Support Vector Machine classifier: {f1_svm_CV_smote}')

In [None]:
# Side-by-side validation f1 scores: basic support vector classifier vs the SMOTE-trained one

# Row labels, one per model
models = ["svc(no smote)","svc(with smote)"]

# Validation f1 scores, listed in the same order as the row labels
f1_scores = [f1_score_svm, f1_svm_CV_smote]

# Name of the single metric column
metrics = ['f1_score']

# One row per model, one column holding the f1 score
model_df = pd.DataFrame({metrics[0]: f1_scores}, index = models)

# Show the comparison table (head() returns at most 5 rows; this table has 2)
model_df.head()

### 7.3. Naive Bayes Classifier

In [None]:
# Instantiate the naive bayes model as nb
nb = MultinomialNB() 

In [None]:
# Fit the model on the training data
nb.fit(X_train, y_train)

In [None]:
# Predict on the validation data
pred_nb = nb.predict(X_val)

#### 7.3.1. Model Performance

In [None]:
# Create a dataframe to display the confusion matrix results
pd.DataFrame(data = confusion_matrix(y_val, pred_nb), index = true_labels, columns = pred_labels)

In [None]:
# Print the classification on the validation set
print(classification_report(y_val, pred_nb, target_names = type_labels))

In [None]:
# Calculate the weighted f1 score of the Naive Bayes classifier on the validation set
f1_nb = f1_score(y_val, pred_nb, average = "weighted")

# Print the score computed above (previously the SVM score, f1_score_svm, was printed here)
print(f'This is the weighted F1 score for the Naive Bayes classifier: {f1_nb}')

### 7.4. SMOTE Naive Bayes Classifier

In [None]:
# Instantiate the naive bayes classifier as nb_smote to use on the resampled data
nb_smote = MultinomialNB()

In [None]:
# Fit the model on the resampled data
nb_smote.fit(X_res, y_res)

In [None]:
# Predict of the validation set
pred_nbsmote = nb_smote.predict(X_val)

#### 7.4.1. Model Performance

In [None]:
# Create a dataframe to display the confusion matrix results
pd.DataFrame(data = confusion_matrix(y_val, pred_nbsmote), index = true_labels, columns = pred_labels)

In [None]:
# Print the classification report of the validation set
print(classification_report(y_val, pred_nbsmote, target_names = type_labels))

In [None]:
# Calculate the weighted f1 score of the SMOTE-trained Naive Bayes classifier on the validation set
f1_nb_smote = f1_score(y_val, pred_nbsmote, average = "weighted")

# Print the score computed above (previously the SVM score, f1_score_svm, was printed here)
print(f'This is the weighted F1 score for the SMOTE Naive Bayes classifier: {f1_nb_smote}')

In [None]:
# Side-by-side validation f1 scores: Naive Bayes classifier vs the SMOTE-trained one

# Row labels, one per model
models = ["NB (no smote)","NB (with smote)"]

# Validation f1 scores, listed in the same order as the row labels
f1_scores = [f1_nb, f1_nb_smote]

# Name of the single metric column
metrics = ['f1_score']

# One row per model, one column holding the f1 score
model_df = pd.DataFrame({metrics[0]: f1_scores}, index = models)

# Show the comparison table (head() returns at most 5 rows; this table has 2)
model_df.head()

# Model Validation

### K-fold cross validation

In [None]:
# Classifiers that 5-fold cross validation is applied to
clf1 = logreg
clf2 = svm_clf
clf3 = nb

# Collects one [label, mean, std] row per classifier; turned into a dataframe in the next cell
cross_val = []

# NOTE(review): scoring is 'f1_micro' here, whereas the validation cells above report
# average="weighted" f1 scores, so these numbers are not directly comparable - confirm intent.
for clf, label in zip([clf1, clf2, clf3], ['Logistic Regression', 'SVM', 'Naive Bayes']):
    print(label)
    scores = cross_val_score(clf, X, y, cv=5, scoring = 'f1_micro')
    # Summarise the per-fold scores once and reuse the values for printing and storing
    cv_mean, cv_std = scores.mean(), scores.std()
    print("f1 score: {:0.4f} (+/- {:0.4f})".format(cv_mean, cv_std))
    cross_val.append([label, cv_mean, cv_std])

In [None]:
# Turn the list of [label, mean, std] rows into a dataframe indexed by model name
cv_columns = ['Model', 'CV_Mean', 'CV_Std_Dev']
cross_val = pd.DataFrame(cross_val, columns=cv_columns).set_index('Model')

In [None]:
# view our dataframe containing cross valiation mean and std
cross_val

## 8. Test Data

In [None]:
# Create a copy of the test to perform noise cleaning and vectorization on
test_data = test.copy()

In [None]:
# Apply the preprocessing function to every message in test_data to remove noise
# before the messages are vectorized in the next cell
test_data['message'] = test_data['message'].apply(preprocessing)

In [None]:
# Encode the cleaned test messages with the existing vectorizer `cv`; only transform() is
# called, so `cv` is not re-fitted on the test messages
# NOTE(review): presumably `cv` is the CountVectorizer fitted on the training messages - confirm
test_cv_trans =  cv.transform(test_data['message'])

# Print out the shape of the vectorized test data
print("The shape of the data is:", test_cv_trans.shape)

### 8.1. Logistic Regression

In [None]:
# predict on test_data
y_pred_logreg_sub = logreg.predict(test_cv_trans)

#### 8.1.1. Logistic Regression Submission

In [None]:
# Extract the tweetid column from test_data to use as the submission file index
tweetid = test_data['tweetid']

In [None]:
# Create a dataframe using the test_data tweetid and and the predicted sentiment
submission_logreg = pd.DataFrame({'tweetid' : tweetid, 'sentiment' : y_pred_logreg_sub})

In [None]:
# Display the last 5 rows of the submission file
submission_logreg.tail()

In [None]:
# Save dataframe to a csv file for submission
submission_logreg.to_csv("AM3_logreg_predictions.csv", index = False)

### 8.2. Support Vector

In [None]:
# predict on test_data
y_pred_svm_sub = svm_clf.predict(test_cv_trans)

#### 8.2.1 Support Vector Submission

In [None]:
# Create a dataframe using the test_data tweetid and and the predicted sentiment
submission_svc = pd.DataFrame({'tweetid' : tweetid, 'sentiment' : y_pred_svm_sub})

In [None]:
# Display the last 5 rows of the submission file
submission_svc.tail()

In [None]:
# Save dataframe to a csv file for submission
submission_svc.to_csv("AM3_SVC_predictions.csv", index = False)

### 8.3. Naive Bayes

In [None]:
y_pred_nb = nb.predict(test_cv_trans)

#### 8.3.1. Naive Bayes Submission

In [None]:
# Create a dataframe using the test_data tweetid and and the predicted sentiment
submission_nb = pd.DataFrame({'tweetid' : tweetid, 'sentiment' : y_pred_nb})

In [None]:
# Display the last 5 rows of the submission file
submission_nb.tail()

In [None]:
# Save dataframe to a csv file for submission
submission_nb.to_csv("AM3_nb_predictions.csv", index = False)

## 9. Conclusion

### 9.1. Model Findings

In [None]:
# Make predictions on the train set
train_pred_logreg = logreg.predict(X_train)
f1_score_train_logreg = f1_score(y_train,train_pred_logreg,average = "weighted") # Train set

In [None]:
# Predict on the train set with the SMOTE logistic regression model
train_pred_logreg_smote = logreg_smote.predict(X_train) # Train_set
# Calculate the weighted f1 score on the train set
f1_score_train_logreg_smote =f1_score(y_train,train_pred_logreg_smote,average = "weighted" )

In [None]:
# Predict on the train set 
train_pred_svm_CV = svm_clf.predict(X_train)
# Calculate the f1 score on the train set
f1_score_train_svm = f1_score(y_train, train_pred_svm_CV,average = "weighted")

In [None]:
# Predict on the train set with the SMOTE support vector classifier
train_pred_svm_smote = svm_clf_CV_smote.predict(X_train)
# Calculate the weighted f1 score on the train set
f1_score_train_svm_smote = f1_score(y_train,train_pred_svm_smote, average = "weighted") 

In [None]:
# Predict on the train data
train_pred_nb = nb.predict(X_train)

# Calculate the f1 score on the train set
f1_score_train_nb = f1_score(y_train,train_pred_nb, average = "weighted") 


In [None]:
# Predict of the train set
train_pred_nb_smote = nb_smote.predict(X_train)

# Calculate the f1 score on the train set
f1_score_train_nb_smote = f1_score(y_train,train_pred_nb_smote, average = "weighted") 

In [None]:
 
    """
    Create a dataframe to show the 
    f1 score of the basic models vs the smote models
    
    """
# Create a list of f1 scores
f1_scores_pred = [f1_score_logreg, 
                  f1_score_logreg_smote, 
                  f1_score_svm,
                  f1_svm_CV_smote, 
                  f1_nb,
                  f1_nb_smote]

#Create a list train f1_score
f1_scores_train = [f1_score_train_logreg, f1_score_train_logreg_smote, 
                    f1_score_train_svm, f1_score_train_svm_smote,
                    f1_score_train_nb,  f1_score_train_nb_smote]

# Create a list to use as row labels
models2 = ["LogisticRegression", "LogisticRegression (with SMOTE)",
          "Support Vector Machine", "Support Vector Machine(with SMOTE)",
          " Multinomial_NB", "Multinomial_NB(with SMOTE)"] 

# Create a dictionary of the the f1 scores 
f1_scores_dict = {'model': models,
                  'f1_score_train': f1_scores_train,
                  'f1_score_pred' : f1_scores_pred,
                  }






# Create the dataframe
models_df = pd.DataFrame.from_dict(f1_scores_dict)

#setting the index
models_df = models_df.set_index('model')


# Display the first 5 rows of the new dataframe
models_df

### F1_score analysis

In [None]:
# Horizontal bar chart of train vs validation f1 score for every model in models_df
ax = models_df.plot.barh(color={"f1_score_train": "red", "f1_score_pred": "green"}, figsize = (15,10))
ax.legend(title="Index",
          ncol= 2,
          prop={'size': 11})
ax.set_title("F1 scores for each model compared to Train Data set") # Set title
# barh draws the values along the x axis and the index (model names) along the y axis,
# so the x axis is the F1 score and the y axis lists the models
ax.set_xlabel("F1 score") # Set x label
ax.set_ylabel("Models") # Set y label
plt.show() # plot the graph

In [None]:
from sklearn.metrics import roc_curve, auc #import the necessary metrics


def plot_multiclass_roc(clf, X_test, y_test, n_classes, figsize=(17, 6), ax = None):
    """
    Plot one-vs-rest ROC curves, with their AUC in the legend, for a fitted classifier.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba`` and ``classes_``
    X_test : validation features passed to ``clf.predict_proba``
    y_test : true labels for ``X_test``
    n_classes : int
        Number of classes to draw; the first ``n_classes`` entries of
        ``clf.classes_`` are plotted.
    figsize : tuple of numbers
        Size of the figure created when ``ax`` is None; ignored otherwise.
    ax : matplotlib axes, optional
        Axes to draw on. A new figure and axes are created when omitted.

    Returns
    -------
    The axes the curves were drawn on.
    """
    # One column of scores per class; each curve below is labelled with the
    # clf.classes_ entry at the same position as its score column
    y_score = clf.predict_proba(X_test)
    y_true = np.asarray(y_test)
    class_labels = clf.classes_[:n_classes]

    # Create a figure only when the caller did not supply axes
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)

    # One-vs-rest: treat each class in turn as the positive class
    for column, class_label in enumerate(class_labels):
        fpr, tpr, _ = roc_curve(y_true == class_label, y_score[:, column])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f}) for Class {class_label}')

    # Diagonal reference line
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0]) # set x
    ax.set_ylim([0.0, 1.05])# set y
    ax.set_xlabel('False Positive Rate') # Set x label
    ax.set_ylabel('True Positive Rate') # Set y label
    ax.legend(loc="best") # Set legend position

    ax.grid(alpha=.4)
    return ax

**LOGISTIC REGRESSION CLASSIFIER**

**ROC AUC CURVE**

In [None]:
# Two side-by-side panels: ROC curves of the logistic regression without and with SMOTE
fig = plt.figure(figsize = (8,15))
ax1, ax2 = (fig.add_subplot(1, 2, panel) for panel in (1, 2))

roc_panels = (
    (ax1, logreg, "The Receiver operating characteristics of the Logistic Regression classifier with no SMOTE"),
    (ax2, logreg_smote, "The Receiver operating characteristics of the Logistic Regression classifier with SMOTE"),
)
for panel_ax, model, panel_title in roc_panels:
    plot_multiclass_roc(model, X_val, y_val, n_classes=4, figsize=(5, 5), ax = panel_ax)
    panel_ax.set_title(panel_title)

# Final size of the figure; this overrides the figsize given at creation
fig.set_size_inches(20, 10)

**CLASS PREDICTION ERROR** 

In [None]:
from yellowbrick.classifier import  ClassPredictionError

# Class prediction error plots for the logistic regression without and with SMOTE.
# Each visualizer is created, fitted, scored on the validation set and drawn before
# the next one is created.
# NOTE(review): fit() receives X_train / y_train for the SMOTE model as well; whether an
# already-fitted estimator is re-fitted here depends on yellowbrick - confirm.
logreg_visualizers = []
for estimator in (logreg, logreg_smote):
    visualizer = ClassPredictionError(
        estimator, classes=[-1,0,1,2]
    )
    visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
    visualizer.score(X_val, y_val)     # Evaluate the model on the validation data
    visualizer.show()                  # Draw visualization
    logreg_visualizers.append(visualizer)

logreg_class, logreg_class_smote = logreg_visualizers


plt.show()

**SUPPORT VECTOR MACHINE**

**CLASS PREDICTION ERROR**

In [None]:
# Class prediction error plots for the support vector classifier without and with SMOTE.
# Each visualizer is created, fitted, scored on the validation set and drawn before
# the next one is created.
# NOTE(review): fit() receives X_train / y_train for the SMOTE model as well; whether an
# already-fitted estimator is re-fitted here depends on yellowbrick - confirm.
svm_visualizers = []
for estimator in (svm_clf, svm_clf_CV_smote):
    visualizer = ClassPredictionError(
        estimator, classes=[-1,0,1,2]
    )
    visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
    visualizer.score(X_val, y_val)     # Evaluate the model on the validation data
    visualizer.show()                  # Draw visualization
    svm_visualizers.append(visualizer)

SVM_class, SVM_class_smote = svm_visualizers


plt.show()

**Naive Bayes Multinomial NB**

In [None]:
# Two side-by-side panels: ROC curves of the Multinomial Naive Bayes without and with SMOTE
fig3 = plt.figure(figsize = (8,15))
ax5, ax6 = (fig3.add_subplot(1, 2, panel) for panel in (1, 2))

roc_panels = (
    (ax5, nb, "The Receiver operating characteristics of the Naive Bayes Multinomial NB classifier with no SMOTE"),
    (ax6, nb_smote, "The Receiver operating characteristics of the Naive Bayes Multinomial NB classifier with SMOTE"),
)
for panel_ax, model, panel_title in roc_panels:
    plot_multiclass_roc(model, X_val, y_val, n_classes=4, figsize=(5, 5), ax = panel_ax)
    panel_ax.set_title(panel_title)

# Final size of the figure; this overrides the figsize given at creation
fig3.set_size_inches(20, 10)

In [None]:
# Class prediction error plots for the Naive Bayes classifier without and with SMOTE.
# Each visualizer is created, fitted, scored on the validation set and drawn before
# the next one is created.
# NOTE(review): fit() receives X_train / y_train for the SMOTE model as well; whether an
# already-fitted estimator is re-fitted here depends on yellowbrick - confirm.
nb_visualizers = []
for estimator in (nb, nb_smote):
    visualizer = ClassPredictionError(
        estimator, classes=[-1,0,1,2]
    )
    visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
    visualizer.score(X_val, y_val)     # Evaluate the model on the validation data
    visualizer.show()                  # Draw visualization
    nb_visualizers.append(visualizer)

NB_class, NB_class_smote = nb_visualizers


plt.show()

### 9.2. Overview of findings

In [None]:
* Logistic Regression performed well compared to its counterparts, producing an f1_score of 0.74 and an accuracy score of 0.75.
* Reasons:
* The data was linearly separable even though it was multiclass.
* Logistic Regression performs optimally when the data being classified is balanced and there are two classes, and it assumes that the data is linearly separable.
* Using the one-vs-rest technique made the classification simpler.