## Installing Essential Libraries

In [None]:

! pip install kaggle
! pip install wordcloud

: 

## Libraries

In [91]:
from zipfile import ZipFile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score ,classification_report
import pickle

In [None]:
# Install the Kaggle API token: expects kaggle.json in the current directory,
# copies it into ~/.kaggle/ and restricts it to owner read/write (mode 600).
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

## Twitter Sentiment Dataset

In [None]:
! kaggle datasets download -d kazanova/sentiment140

In [5]:
# Unpack the downloaded Sentiment140 archive into the current working directory
dataset_path = '/content/sentiment140.zip'
with ZipFile(dataset_path, mode='r') as archive:
  archive.extractall()

In [None]:
nltk.download('stopwords')

In [None]:
# Print the English stopword list.
# These are words that do not carry any important meaning in the data.
words = stopwords.words('english')
print( words)

## Data Processing

In [8]:
# The CSV has no header row: header=None keeps pandas from consuming the
# first tweet as column names (columns are numbered 0..5 until named).
tweets_df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',header=None,encoding='ISO-8859-1')

In [None]:
tweets_df.head()

In [None]:
 # no of rows and columns
tweets_df.shape

In [11]:
# Adding column names to the dataframe: re-read the CSV with explicit names
# so that every row of the file is treated as data.
columns =['target','ids','date','flag','user','text']
tweets_df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',names = columns ,encoding='ISO-8859-1')

In [None]:
tweets_df.head()

In [None]:
# Analyzing missing values: count of nulls per column
tweets_df.isnull().sum()

In [None]:
# Distribution of the target column: number of tweets per label
tweets_df['target'].value_counts()



## Converting labels into numbers
The raw `target` column marks positive tweets with 4; we remap 4 to 1 so that:

- 0 means negative
- 1 means positive

In [15]:
# Remap label 4 to 1 in the 'target' column only; replace() returns a new
# frame, so tweets_df itself is left unchanged.
tweets_df2 = tweets_df.replace({'target':{4:1}})

In [None]:
tweets_df2['target'].value_counts()

## Stemming Process
Stemming reduces a word to its root form (for example, "running" becomes "run").

In [17]:
porter_stemmer = PorterStemmer()

In [18]:
# Built once at definition time instead of once per tweet: loading the stopword
# corpus and scanning a list for every word dominates the runtime on large data.
_NON_LETTER_PATTERN = re.compile('[^a-zA-Z]')
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(data):
  """Normalize one tweet and return its stemmed, stopword-free text.

  Steps: replace every non-letter character with a space, lowercase,
  split on whitespace, drop English stopwords, apply the Porter stemmer
  to the remaining words and join them back with single spaces.

  Args:
    data: the raw tweet text (a string).

  Returns:
    A single space-separated string of stemmed words; empty if nothing
    remains after filtering.
  """
  tokens = _NON_LETTER_PATTERN.sub(' ', data).lower().split()
  return ' '.join(
      porter_stemmer.stem(token)
      for token in tokens
      if token not in _ENGLISH_STOPWORDS  # O(1) set lookup
  )

In [None]:
tweets_df2['stemmed_tweets'] = tweets_df2['text'].apply(stemming)
tweets_df2.head()

In [96]:
# Separating data and label: x holds the stemmed tweet text, y the 0/1 target
x = tweets_df2['stemmed_tweets'].values
y = tweets_df2['target'].values

In [None]:
print(x)

In [None]:
print(y)

In [98]:
# For generating the word cloud we need to join all stemmed tweets into a single string
text = ' '.join(x)

## Word Cloud

In [100]:
wordcloud = WordCloud(width=800,height=400,background_color='white',colormap='plasma',max_words=100).generate(text)

In [None]:
# Draw the word cloud image on a 10x5 inch figure with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

## Splitting data into training and test data

In [38]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

In [None]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

In [None]:
print(x_train)

## Converting textual data into numerical data

In [40]:
# Fit the TF-IDF vocabulary on the training tweets only, then reuse the fitted
# vectorizer for the test tweets.
# NOTE: x_train / x_test are rebound here from raw text to TF-IDF features, so
# re-running this cell requires re-running the train/test split cell first.
vector  = TfidfVectorizer()
x_train = vector.fit_transform(x_train)
x_test = vector.transform(x_test)

In [None]:
print(x_train)

In [None]:
x_train.shape,x_test.shape

## Training model


In [49]:
#Logistic regression
iterations = 2000
model = LogisticRegression(max_iter = iterations)

In [None]:
model.fit(x_train,y_train)

## Model Evaluation

In [51]:
# Accuracy on the same data the model was fitted on
training_predictions = model.predict(x_train)
training_accuracy = accuracy_score(y_train, training_predictions)

In [None]:
print(f'training accuracy : {training_accuracy*100}')

In [53]:
# Accuracy Score on test Data
test_predictions = model.predict(x_test)
test_accuracy = accuracy_score(y_test,test_predictions)

In [None]:
print(f'Test Accuracy : {test_accuracy*100}')

In [57]:
# Classification Report
report = classification_report(y_test,test_predictions)

In [None]:
print(report)

## Saving Trained Model


In [60]:
# Persist the trained classifier; the context manager guarantees the file is
# flushed and closed once the dump finishes.
# NOTE: the fitted TfidfVectorizer (`vector`) is not saved here, and raw text
# cannot be turned into features for this model without it.
name = 'model.pkl'
with open(name, 'wb') as model_file:
  pickle.dump(model, model_file)

## Loading model for Future Predictions

In [61]:
# Load from the same relative path the model was saved to, and close the file
# afterwards. Only unpickle files you created yourself: pickle.load can execute
# arbitrary code.
with open('model.pkl', 'rb') as model_file:
  load_model = pickle.load(model_file)

In [86]:
  #creating new sample of data
x = x_test[:5]


In [87]:
# Predict with the model restored from disk so the save/load round trip is actually exercised
predictions = load_model.predict(x)

In [None]:
a = len(predictions)
a

In [None]:
# Print a label for each prediction: 0 is negative, anything else is positive.
# Iterating over the predictions directly avoids relying on a length computed in another cell.
for prediction in predictions:
  if prediction == 0:
    print('Negative Tweet')
  else:
    print('Positive Tweet')