# **Load libraries**

In [None]:
from google.colab import files
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# **Load file**

In [None]:
# Upload the dataset through Colab's file-upload helper. The return value is
# kept in file1, but the cells shown here read the CSV from disk by name instead.
file1 = files.upload()

TypeError: 'NoneType' object is not subscriptable

In [None]:
# First look at the raw file. No column names are passed here, so pandas
# takes the header from the first line of the file.
twitter_data = pd.read_csv('twitter.csv', encoding='ISO-8859-1')

# **Download stopwords**

In [None]:
import nltk
nltk.download('stopwords')

# Stop words are very common words (e.g. 'is', 'the', 'and') that add little
# meaning to a sentence, so they are removed from each tweet before modelling.
print(stopwords.words('english'))

# **Data processing**

In [None]:
# Preview the first rows of the frame that was read without explicit column names.
twitter_data.head()

In [None]:
# Column names to assign when the dataset is read again (the file itself has
# no header row). None of the names carries surrounding whitespace, so each
# column can be selected by its plain name, e.g. twitter_data['user'].

column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

In [None]:
# Read the dataset again, this time supplying the column names explicitly.
twitter_data = pd.read_csv(
    'twitter.csv',
    names=column_names,
    encoding='ISO-8859-1',
)

In [None]:
# Count the missing (null) values in each column of the dataset.
twitter_data.isnull().sum()

In [None]:
# Check the distribution of the "target" column, i.e. how many tweets carry
# each label: 0 = negative, 2 = neutral, 4 = positive.
twitter_data['target'].value_counts()

In [None]:
# Relabel the positive class: every 4 in the "target" column becomes 1.
# All other values in the column are left as they are.
twitter_data['target'] = twitter_data['target'].replace(4, 1)

In [None]:
# Check again: the labels should now be 0 (negative tweet) and 1 (positive tweet).
twitter_data['target'].value_counts()

# **Stemming**

In [None]:
# Single Porter stemmer instance; stemming() uses it for every word of every tweet.
port_stem = PorterStemmer()

In [None]:
# Compiled once and reused for every tweet.
_NON_LETTER_PATTERN = re.compile('[^a-zA-Z]')
# English stop words as a set; filled in on the first call to stemming().
_english_stop_words = None


def stemming(content):
  """Clean a tweet and reduce its words to their Porter stems.

  Every character other than an ASCII letter is replaced by a space, the
  text is lower-cased and split on whitespace, English stop words such as
  'is', 'the' and 'and' are dropped, and each remaining word is reduced to
  its base form with ``port_stem`` (e.g. 'running' -> 'run').

  Args:
    content: Raw tweet text as a string.

  Returns:
    The cleaned, stemmed words joined by single spaces. The result is an
    empty string when no word survives the filtering.
  """
  global _english_stop_words
  if _english_stop_words is None:
    # Build the stop-word set once. Calling stopwords.words() for every word
    # would produce a fresh list and scan it linearly each time, which
    # dominates the run time on a large dataset.
    _english_stop_words = frozenset(stopwords.words('english'))

  letters_only = _NON_LETTER_PATTERN.sub(' ', content)
  words = letters_only.lower().split()
  stemmed_words = [
      port_stem.stem(word)
      for word in words
      if word not in _english_stop_words
  ]
  # Join back into one space-separated string, the form the vectorizer expects.
  return ' '.join(stemmed_words)



In [None]:
# Stemming runs once per tweet, so this step takes time in proportion to the
# size of the data.
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [None]:
# Show the label column (0 = negative, 1 = positive once 4 has been relabelled).
print(twitter_data['target'])

In [None]:
# Separate the features (stemmed tweet text) from the labels.
x = twitter_data['stemmed_content'].values
y = twitter_data['target'].values

# **Splitting data into training and testing**

In [None]:
# Split into x_train/y_train and x_test/y_test, holding out 20% for testing.
# The split is stratified on y, and random_state is fixed so it can be repeated.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [None]:
# Inspect the training inputs (still stemmed tweet strings at this point).
print(x_train)

In [None]:
# Inspect the test inputs (still stemmed tweet strings at this point).
print(x_test)

In [None]:
 # Convert the text data to numerical TF-IDF vectors so the model can work with it;
 # each word is given an importance based on how often it is repeated.
 # The vectorizer is fitted on the training text only and then reused on the test text.
 vectorizer = TfidfVectorizer()
 x_train = vectorizer.fit_transform(x_train)
 x_test = vectorizer.transform(x_test)

In [None]:
# Inspect the training features, now the output of vectorizer.fit_transform.
print(x_train)

In [None]:
# Inspect the test features, now the output of vectorizer.transform.
print(x_test)

# **Logistic Regression training**

In [None]:
# max_iter sets the cap on solver iterations; 1000 gives the solver more room
# to converge on this data than the library default.
model = LogisticRegression(max_iter=1000)

model.fit(x_train, y_train)

# **Model evaluation**

In [None]:
# Accuracy on the training data. accuracy_score takes the true labels first
# and the predictions second, the same order used for the test data.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [None]:
# Report the fraction of training samples the model labels correctly.
print('accuracy score for training data:', training_data_accuracy)

In [None]:
# Accuracy on the held-out test data: predict labels for x_test and compare
# them with the true labels in y_test.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test,x_test_prediction)

In [None]:
# Report the fraction of test samples the model labels correctly.
print('accuracy score for test data :', test_data_accuracy)

# **Saving the trained model**

In [None]:
import pickle

In [None]:
# Persist the trained classifier. The with-block closes the file as soon as
# the dump finishes, so the data is flushed to disk and no handle is left open.
# NOTE(review): only the classifier is saved here; the fitted vectorizer is
# also needed to score raw text in a fresh session.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

In [None]:
# Load the saved model back. The with-block closes the file once it is read.
# Unpickling can run arbitrary code, so only load files you created yourself.
with open(filename, 'rb') as model_file:
  load_model = pickle.load(model_file)

In [None]:
# Take the entry at index 200 of the vectorized test set as a sample input
# and print its true label for comparison with the prediction.
x_nex = x_test[200]
print(y_test[200])

In [None]:
# Score the sample with the reloaded model and show the raw prediction.
prediction = load_model.predict(x_nex)
print(prediction)

# Label 0 is reported as a negative tweet; any other label as positive.
print('Negative tweet' if prediction[0] == 0 else 'Positive tweet')