## Import libraries

In [168]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import Pool, CatBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

from gensim.parsing.preprocessing import remove_stopwords
from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Importing the Data

In [169]:
train =  pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

## Train Data

In [170]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [171]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB


In [172]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

### Text Cleaning
Removing the noise in the data.

In [173]:
train['message'] = train['message'].str.lower()
test['message'] = test['message'].str.lower()


In [174]:
stemmer=PorterStemmer()

def remove_pattern(input_txt):
    input_txt = re.sub("\\W"," ",input_txt)
    input_txt = re.sub("\\s+(in|the|all|for|and|on)\\s+"," ",input_txt)
    
    
    # stem words
    words = re.split("\\s+",input_txt)
    stemmed_words = [stemmer.stem(word=word) for word in words]
    
    return ' '.join(stemmed_words)

In [175]:
def tokenizer(input_txt):
    text=re.sub("(\\W)"," \\1 ",input_txt)

    # split based on whitespace
    return re.split("\\s+",input_txt)

In [176]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesn't think carbon di...,625221
1,1,it's not like we lack evidence of anthropogeni...,126103
2,2,rt @rawstory: researchers say we have three ye...,698562
3,1,#todayinmaker# wired : 2016 was a pivotal year...,573736
4,1,"rt @soynoviodetodas: it's 2016, and a racist, ...",466954


In [None]:
from nltk.corpus import stopwords
import re 
stop = stopwords.words('english')
remove_words = ['rt']

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
train["message"] = train["message"].apply(lambda x: ' '.join([word for word in x.split() if word[0]!='#' and word[0]!='@'])) #Remove # and @
train['message'] = train['message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))            #Remove stopwords
train['message'] = train['message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (remove_words)]))    #Remove additional words

test["message"] = test["message"].apply(lambda x: ' '.join([word for word in x.split() if word[0]!='#' and word[0]!='@']))
test['message'] = test['message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
test['message'] = test['message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (remove_words)]))

In [177]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesn't think carbon di...,625221
1,1,it's not like we lack evidence of anthropogeni...,126103
2,2,rt @rawstory: researchers say we have three ye...,698562
3,1,#todayinmaker# wired : 2016 was a pivotal year...,573736
4,1,"rt @soynoviodetodas: it's 2016, and a racist, ...",466954


### Spliting the training data into X & Y variables
We split the training data into the X variables which will be the message(tweets that are tweeted out) and Y variables which will be the sentiment(If a person tweeting believes in Climate Change or not).

Then we transform out data into numbers using CountVectorizer into a language that the computer can easily understand.

In [178]:
y = train['sentiment']
X = train['message']

In [179]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),tokenizer=tokenizer, min_df=2,max_df=0.70,analyzer='word',smooth_idf=False, preprocessor=remove_pattern ,stop_words="english")
X_cnt_vectorized = vectorizer.fit_transform(X)

  'stop_words.' % sorted(inconsistent))


### Perfomance validation for train

In [185]:
X_train,X_test,y_train,y_test = train_test_split(X_cnt_vectorized,y,test_size=0.08,shuffle=True, random_state=23)

### Training the different train data models and F1 Score

#### Random Forest

In [186]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
rfc_cnt_pred = rfc.predict(X_test)

In [187]:
f1_score(y_test, rfc_cnt_pred, average="macro")

0.5667321767416644

#### SVM Linear Classifier Train

In [188]:
from sklearn import svm

svm_lin = svm.LinearSVC()
svm_lin.fit(X_train, y_train)
svm_cnt_pred = svm_lin.predict(X_test)

In [189]:
f1_score(y_test, svm_cnt_pred, average="macro")

0.6683787033069274

#### SGD Classifier Train

In [190]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_cnt_pred = sgd.predict(X_test)

In [191]:
f1_score(y_test, sgd_cnt_pred, average="macro")

0.6317591801921397

#### Linear Regression Train

In [192]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state = 23)
lr.fit(X_train, y_train)
lr_cnt_pred = lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [193]:
f1_score(y_test, lr_cnt_pred, average="macro")

0.6088427405216152

## Test Data

### Spliting the training data into X & Y variables

In [194]:
test_X = test['message']
test_cnt_vect = vectorizer.transform(test_X)

### Testing the test model data and F1 Score

#### Random Forest Test

In [195]:
y_cnt_pred = rfc.predict(test_cnt_vect)

#### SVM Linear Classifier Train

In [196]:
y_cnt_pred = svm_lin.predict(test_cnt_vect)

#### SGD Classifier Train

In [None]:
y_cnt_pred = sgd.predict(test_cnt_vect)

#### Logistic Regression Test

In [None]:
y_cnt_pred = lr.predict(test_cnt_vect)

### Final Test predict

In [197]:
test['sentiment'] = y_cnt_pred

In [198]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,europe will now be looking to china to make su...,169760,1
1,combine this with the polling of staffers re c...,35326,1
2,"the scary, unimpeachable evidence that climate...",224985,1
3,@karoli @morgfair @osborneink @dailykos \nputi...,476263,1
4,rt @fakewillmoore: 'female orgasms cause globa...,872928,0


## Creating an output csv for submission

In [161]:
test[['tweetid','sentiment']].to_csv('testsubmission30.csv', index=False)