In [1]:
import pandas as pd

In [2]:
import chardet

In [3]:
# Sniff the CSV's character encoding from its raw bytes so pandas can decode it
with open('global_warming (1).csv', 'rb') as f:
    detection = chardet.detect(f.read())
encoding = detection['encoding']

In [4]:
# Load the dataset, decoding it with the encoding detected in the previous cell
df = pd.read_csv(
    'global_warming (1).csv',
    encoding=encoding,
)

In [5]:
# Dimensions of the loaded dataframe as (rows, columns)
df.shape

(4225, 2)

In [6]:
# Importing essential libraries for performing Natural Language Processing on the global warming tweets dataset
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Premalatha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Cleaning the tweets: every tweet becomes one space-joined string of stemmed,
# lower-case, non-stop-word tokens appended to `corpus` (same order as df rows)
corpus = []

# Build the stop-word set and the stemmer once; neither changes between tweets.
# (Previously the set was rebuilt for every single word.)
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

# Iterate over the column itself instead of a hard-coded row count (was range(0,4225)),
# so the cell keeps working if the dataset size changes
for tweet in df['tweet']:

  # Replacing every non-letter character with a space
  review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=tweet)

  # Converting the entire tweet into lower case
  review = review.lower()

  # Tokenizing the tweet by whitespace
  review_words = review.split()

  # Removing the stop words
  review_words = [word for word in review_words if word not in english_stopwords]

  # Stemming the words
  review = [ps.stem(word) for word in review_words]

  # Joining the stemmed words
  review = ' '.join(review)

  # Creating a corpus
  corpus.append(review)

In [8]:
# Before preprocessing: a random sample of raw rows.
# A fixed random_state makes the displayed sample identical on every run.
df.sample(15, random_state=0)

Unnamed: 0,tweet,existence
1784,RT @rorycooper Seriously? Reaching. RT @Drudg...,1
3680,".@LuvableSole 12-20"" what?! effing global warm...",0
2898,my pc's a hand crank! .@mrdannyglover i suppor...,1
3564,Busted! Rajendra Pachauri: head of UN climate ...,0
3200,Proof there's no climate change. RT @jazgar 75...,0
4074,@vballr4life7 too whiche lie or distortion of ...,0
3276,New post: Cloudy with a Chance of Climate Chan...,0
2352,http://bit.ly/9m2Qfp global warming solutions ...,1
2609,global warming http://bit.ly/dxIaFb ;P Study S...,1
2717,Impacts of climate change and how native peopl...,1


In [9]:
# After preprocessing: the first ten cleaned documents
corpus[:10]

['global warm report urg govern act brussel belgium ap world face increas hunger link',
 'fight poverti global warm africa link',
 'carbon offset vatican forest fail reduc global warm link',
 'carbon offset vatican forest fail reduc global warm link',
 'uruguay tool need vulner climat chang link',
 'rt sejorg rt jaymiheimbuch ocean salti show global warm intensifi water cycl link',
 'global warm evid around us messag global warm denier doubter look around link',
 'migratori bird new climat chang strategi stay home link',
 'southern africa compet limpopo water climat chang bring higher temperatur south link',
 'global warm impact wheat rice product india ludhiana apr scarciti water seriou link']

In [10]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary is capped at 4500 terms
cv = CountVectorizer(max_features=4500)

# Learn the vocabulary and count terms per document, then densify the result
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()

# Target labels: the second column of the dataframe
y = df.iloc[:, 1].values

In [11]:
# Shape of the dense bag-of-words matrix: (documents, vocabulary terms)
X.shape

(4225, 4500)

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [13]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training are chained
classifier = MultinomialNB().fit(X_train, y_train)
classifier  # last expression: display the fitted estimator

In [14]:
# Predicting the Test set results: class labels from the fitted Naive Bayes
# model for every row of the held-out feature matrix X_test
y_pred = classifier.predict(X_test)

In [15]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Compare the held-out labels against the model's predictions
score1 = accuracy_score(y_test, y_pred)
score2 = precision_score(y_test, y_pred)
score3 = recall_score(y_test, y_pred)

# Accuracy is shown as a percentage; precision and recall as fractions
print("---- Scores ----")
print(f"Accuracy score is: {round(score1*100, 2)}%")
print(f"Precision score is: {round(score2, 2)}")
print(f"Recall score is: {round(score3, 2)}")

---- Scores ----
Accuracy score is: 82.84%
Precision score is: 0.89
Recall score is: 0.87


In [16]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns the predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[157,  67],
       [ 78, 543]], dtype=int64)

In [None]:
# Plotting the confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Assuming you have already computed the confusion matrix and stored it in 'cm'
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap="YlGnBu", xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], 
            cbar_kws={'label': 'Frequency'})
plt.xlabel('Predicted values')
plt.ylabel('Actual values')
plt.tight_layout()  # Adjust layout to prevent cutoff of labels
plt.show()

In [None]:
import numpy as np

In [None]:
# Hyperparameter tuning the Naive Bayes Classifier: try alpha = 0.1, 0.2, ..., 1.0
# and remember the value with the highest accuracy (first one wins on ties).
# NOTE(review): every alpha is scored on X_test/y_test, the same split used for the
# reported metrics, so the best accuracy below is an optimistic estimate; consider
# selecting alpha on a validation split or with cross-validation instead.
best_accuracy = 0.0
alpha_val = 0.0
for i in np.arange(0.1,1.1,0.1):
  temp_classifier = MultinomialNB(alpha=i)
  temp_classifier.fit(X_train, y_train)
  temp_y_pred = temp_classifier.predict(X_test)
  score = accuracy_score(y_test, temp_y_pred)
  print("Accuracy score for alpha={} is: {}%".format(round(i,1), round(score*100,2)))
  if score>best_accuracy:
    best_accuracy = score
    alpha_val = i
print('--------------------------------------------')
# Report the winner; previously the search result was computed but never shown
print("The best accuracy is {}% with alpha value as {}".format(round(best_accuracy*100,2), round(alpha_val,1)))

In [None]:
# Refit the final model with the smoothing value selected by the alpha search
# (alpha_val) instead of a hard-coded 0.2, so it always matches the tuning result
classifier = MultinomialNB(alpha=alpha_val)
classifier.fit(X_train, y_train)

In [None]:
def predict_sentiment(sample_review):
  """Classify one piece of raw text with the fitted Naive Bayes model.

  The text is cleaned the same way as the training corpus: non-letters are
  replaced by spaces, the text is lower-cased, English stop words are removed
  and the remaining words are Porter-stemmed. The result is vectorised with
  the already-fitted CountVectorizer `cv` and passed to `classifier`.

  Parameters
  ----------
  sample_review : str
      Raw text to classify.

  Returns
  -------
  numpy.ndarray
      One-element array holding the label predicted by `classifier`.
  """
  # Build the stop-word set once per call instead of once per word
  english_stopwords = set(stopwords.words('english'))
  ps = PorterStemmer()

  sample_review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=sample_review)
  sample_review = sample_review.lower()
  sample_review_words = [
    word for word in sample_review.split() if word not in english_stopwords
  ]
  final_review = ' '.join(ps.stem(word) for word in sample_review_words)

  # cv.transform expects an iterable of documents, hence the one-element list
  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [None]:
# Predicting values
sample_review = 'its nearly 37 degrees hot in here'

# predict_sentiment returns a one-element array; take the scalar label explicitly
# rather than relying on the truthiness of the array
predicted_label = predict_sentiment(sample_review)[0]

# The model labels tweets by the 'existence' column, not reviews.
# NOTE(review): wording assumes 1 means the tweet affirms global warming - confirm
if predicted_label == 1:
  print('This tweet is predicted to affirm that global warming exists (label 1).')
else:
  print('This tweet is predicted NOT to affirm that global warming exists (label 0).')