In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing the necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import re   #pattern matching or search through
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
! pip install tqdm

In [None]:
from tqdm.notebook import tqdm

In [None]:
import nltk
# Fetch NLTK's stopword corpus so that stopwords.words('english') can be loaded.
# NOTE(review): in this notebook the stopword list is only printed; the
# stemming() function does not filter tokens against it.
nltk.download('stopwords')

In [None]:
print(stopwords.words('english'))

**Data Processing**

In [None]:
# Read the dataset.
# No `header`/`names` argument is given here, so pandas takes the first line of
# the CSV as the column labels; a later cell re-reads the file with explicit
# column names.
df = pd.read_csv("/kaggle/input/twittesentimentanalysis/tweets.csv",encoding = 'ISO-8859-1')

In [None]:
# check the number of rows and columns
df.shape

In [None]:
#print top rows
df.head()

In [None]:
# Reload the CSV with explicit column labels, so the first line of the file is
# treated as data rather than as a header row.
col_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv(
    "/kaggle/input/twittesentimentanalysis/tweets.csv",
    names=col_names,
    encoding='ISO-8859-1',
)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# counting the number of missing values 
df.isnull().sum()

In [None]:
#check the distribution of target columns
df['target'].value_counts()

In [None]:
# Relabel the positive class: a target value of 4 becomes 1; every other value
# in the column is left unchanged.
df['target'] = df['target'].replace({4: 1})

**Stemming**

In [None]:
port_stem = PorterStemmer()


def stemming(content):
    """Normalise one piece of text into a string of Porter-stemmed tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace, each token is stemmed with the
    module-level ``port_stem``, and the stems are joined with single spaces.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        Space-separated stemmed tokens (empty string if no letters remain).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    return ' '.join(port_stem.stem(token) for token in tokens)


# Register progress_apply on pandas objects, then stem every tweet with a
# progress bar and store the result in a new column.
tqdm.pandas()
df['stemmed_content'] = df['text'].progress_apply(stemming)

In [None]:
df.head()

In [None]:
print(df['stemmed_content'])

In [None]:
#separating data and label
X = df['stemmed_content'].values
y = df['target'].values

In [None]:
print(X)

In [None]:
print(y)

In [None]:
X.shape

**Splitting into training and test data**

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the proportion of each
# class in y the same in both the training and the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=43,
)

In [None]:
print(X_train.shape,X_test.shape)

In [None]:
X_test

**Converting textual data to numerical data**

In [None]:
vectorizer = TfidfVectorizer()
# The training data is both fit and transformed, whereas the test data is only
# transformed with the already-fitted vectorizer (it is never fit on test data).
# NOTE(review): X_train and X_test are overwritten with the vectorized output,
# so re-running this cell on its own would pass those matrices back into the
# vectorizer; re-run the train/test split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


In [None]:
print(X_train)

In [None]:
print(X_test)

**Train the Machine Learning Model**
Logistic Regression

In [None]:
model = LogisticRegression(max_iter = 1000)

In [None]:
print(X_train.shape,y_train.shape)

In [None]:
model.fit(X_train,y_train)

**Model Evaluation**

In [None]:
#accuracy score
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train,X_train_prediction)

In [None]:
print('Accuracy score in the training data : ', training_data_accuracy)

In [None]:
#accuracy score on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test,X_test_prediction)

In [None]:
print('Accuracy score in the test data by Logistic Regression: ', test_data_accuracy)

**Using Naive Bayes as the model**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the same TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Score the Naive Bayes classifier itself: predicting with `model` here would
# only repeat the Logistic Regression accuracy. The results get their own names
# so that X_test_prediction (read by the confusion-matrix cell) and
# test_data_accuracy keep holding the Logistic Regression results.
nb_test_prediction = clf.predict(X_test)
nb_test_accuracy = accuracy_score(y_test, nb_test_prediction)
print('Accuracy score in the test data by Naive Bayes : ', nb_test_accuracy)

**Confusion Matrix**

In [None]:
# confusion_matrix puts the true labels on the rows and the predicted labels on
# the columns; label the axes so the heatmap can be read on its own.
cm = confusion_matrix(y_test, X_test_prediction)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_title('Test Confusion Matrix')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

**Saving this trained model**

In [None]:
import pickle


In [None]:
filename = 'trained_model.sav'
# 'wb' opens the file for writing in binary mode. The context manager closes
# the file, so the pickle is fully written to disk before it is read back.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)


**Using the saved model**

In [None]:
# Load the saved model; the context manager closes the file handle afterwards.
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code contained in the file.
with open('/kaggle/working/trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)


In [None]:
# Classify one held-out sample with the reloaded model and print the true
# label, the raw prediction, and a human-readable sentiment.
sample_idx = 200
X_new = X_test[sample_idx]
print(y_test[sample_idx])
prediction = loaded_model.predict(X_new)
print(prediction)

print('Negative tweet' if prediction[0] == 0 else 'Positive tweet')

In [None]:
# Classify one held-out sample with the reloaded model and print the true
# label, the raw prediction, and a human-readable sentiment.
sample_idx = 34
X_new = X_test[sample_idx]
print(y_test[sample_idx])
prediction = loaded_model.predict(X_new)
print(prediction)

print('Negative tweet' if prediction[0] == 0 else 'Positive tweet')