# **VilearnX Advanced Technologies**

# **TASK-2:  SOCIAL MEDIA SENTIMENT ANALYSIS using ML.ipynb**

## Author: Shaik Rafi

## Batch: July

## Domain: Data Analytics

In [None]:
# installing the kaggle library
! pip install kaggle

## Upload your kaggle.json file

In [None]:
# configuring the path of kaggle.json file
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

## Importing Twitter Sentiment Dataset

In [None]:
# API to fetch the dataset from kaggle
!kaggle datasets download -d kazanova/sentiment140

In [None]:
# extracting the compressed dataset

from zipfile import ZipFile

dataset = '/content/sentiment140.zip'

# Unpack the downloaded archive into the current working directory.
# The handle is named `archive` so that the built-in zip() is not rebound
# to a closed ZipFile object for the remainder of the notebook session.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()
  print('The dataset is extracted')

## Importing The Dependencies

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
# printing the stopwords in English
print(stopwords.words('english'))

## Data Processing

In [None]:
# loading the data from csv file to pandas dataframe
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',  encoding = 'ISO-8859-1')

In [None]:
# checking the number of rows and columns
twitter_data.shape

In [None]:
# printing the first 5 rows of the dataframe
twitter_data.head()

In [None]:
# naming the columns and reading the dataset again

column_names = ['target', 'id',  'date',  'flag', 'user',  'text']
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', names=column_names, encoding = 'ISO-8859-1')

In [None]:
# checking the number of rows and columns
twitter_data.shape

In [None]:
# printing the first 5 rows of the dataframe
twitter_data.head()

In [None]:
# counting the number of missing values in the dataset
twitter_data.isnull().sum()

In [None]:
# checking the distribution of target column
twitter_data['target'].value_counts()

# Convert the target "4" to "1"

In [None]:
twitter_data.replace({'target' :{4:1}}, inplace=True)


In [None]:
# checking the distribution of target column
twitter_data['target'].value_counts()

## 0 -->Negative Tweet

## 1 -->Positive Tweet

# **Stemming**

## Stemming is the process of reducing a word to its Root Word

## example: actor, actress, acting = act

In [None]:
port_stem = PorterStemmer()

In [None]:
_ENGLISH_STOPWORDS = None  # populated on first use by _english_stopwords()


def _english_stopwords():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def stemming(content):
  """Clean one tweet and reduce each remaining word to its Porter stem.

  Every non-letter character is replaced by a space, the text is lower-cased
  and split on whitespace, English stop words are dropped, and the surviving
  words are stemmed with the notebook's `port_stem` and joined by single
  spaces.

  Args:
    content: the raw tweet text (a string).

  Returns:
    A string of space-separated stemmed words; empty if nothing remains.
  """
  # Substitute a space, not an empty string: deleting the characters would
  # also delete the blanks between words and glue the tweet into one token,
  # which makes the split/stop-word/stemming steps below meaningless.
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  words = letters_only.lower().split()

  # Look the stop words up in a cached set instead of calling
  # stopwords.words('english') again for every single word.
  stop_words = _english_stopwords()
  stemmed_words = [port_stem.stem(word) for word in words if word not in stop_words]

  return ' '.join(stemmed_words)

In [None]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)        # 50 minutes to complete this execution

In [None]:
twitter_data.head()

In [None]:
print(twitter_data['stemmed_content'])

In [None]:
print(twitter_data['target'])

In [None]:
# separating the data and label
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values


In [None]:
print(X)

In [None]:
print(Y)

##  Splitting the data to training data and test data

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,  stratify=Y,  random_state=2)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

In [None]:
print(X_train)

In [None]:
print(X_test)

In [None]:
# converting the textual data to numerical data

vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training split only, then apply that same fitted
# vectorizer to the test split, so nothing is learned from the test data.
# Note that X_train / X_test are rebound from text arrays to feature matrices here.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
print(X_train)

In [None]:
print(X_test)

## Training the Machine Learning Model

## Logistic Regression

In [None]:
model = LogisticRegression(max_iter=1000)

In [None]:
model.fit(X_train, Y_train)

## Model Evaluation

### Accuracy score

In [None]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
print('Accuracy score on the training data :', training_data_accuracy)

In [None]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
print('Accuracy score on the test data :', test_data_accuracy)

### Model accuracy = 51.3%

## Saving the trained model

In [None]:
import pickle

In [None]:
filename = 'trained_model.sav'

# Write the trained model to disk; the context manager closes the file
# even if pickling raises.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

## Using the saved model for future predictions

In [None]:
# loading the saved model
# Read it back from the same relative path it was saved under; the context
# manager closes the file once the model has been read.
# NOTE: only unpickle files you created yourself - pickle can run code on load.
with open('trained_model.sav', 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [None]:
# Check the reloaded model on one held-out sample (row 200 of the test set).
X_new = X_test[200]
print(Y_test[200])

# Predict with the model restored from disk, so the saved file is what gets
# exercised rather than the in-memory model it was created from.
prediction = loaded_model.predict(X_new)
print(prediction)

# Label encoding used in this notebook: 0 = negative, 1 = positive.
if prediction[0] == 0:
  print('Negative Tweet')
else:
  print('Positive Tweet')

In [None]:
# Check the reloaded model on a second held-out sample (row 3 of the test set).
X_new = X_test[3]
print(Y_test[3])

# Predict with the model restored from disk, so the saved file is what gets
# exercised rather than the in-memory model it was created from.
prediction = loaded_model.predict(X_new)
print(prediction)

# Label encoding used in this notebook: 0 = negative, 1 = positive.
if prediction[0] == 0:
  print('Negative Tweet')
else:
  print('Positive Tweet')

# TASK 2 IS COMPLETED
