In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# **Load in my data from kaggle.**

In [2]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

In [3]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

# Splitting out the X variable from the target****

In [4]:
y = train['sentiment']
X = train['message']

> > *Removing special characters and normalize certain words*

In [5]:
# init stemmer
from nltk.stem import PorterStemmer
po_stemmer= PorterStemmer()

def preprocessor(txt):
    
    txt=txt.lower() 
    txt=re.sub("\\W"," ",txt)
    txt=re.sub("\\s+(in|the|all|for|and|on)\\s+"," _connector_ ",txt)
    
    # stem words
    final_words=re.split("\\s+",txt)
    s_words=[po_stemmer.stem(word=word) for word in final_words]
    return ' '.join(s_words)

> > **Creating space between special characters and removing special characters****


In [6]:
def tkenizer(txt):
    txt=re.sub("(\\W)"," \\1 ",txt)
    return re.split("\\s+",txt)

# Turning text into something a model can read

In [7]:

vectorizer = TfidfVectorizer(ngram_range=(1,2),tokenizer=tkenizer,smooth_idf=False, min_df=2,max_df=0.80,analyzer='word', preprocessor=preprocessor,stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

  'stop_words.' % sorted(inconsistent))


# **Splitting the training data into a training and validation set**

In [8]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=.3,shuffle=True, stratify=y, random_state=11)

# **Training the model and evaluating using the validation set**

In [9]:
lisvc = LinearSVC()
lisvc.fit(X_train, y_train)
lisvc_pred = lisvc.predict(X_val)

# **Checking the performance of our model on the validation set**


In [10]:
f1_score(y_val, lisvc_pred, average="macro")

0.6445753117945924

In [11]:
from sklearn import metrics

print(metrics.classification_report(y_val, lisvc_pred))


              precision    recall  f1-score   support

          -1       0.69      0.42      0.53       389
           0       0.57      0.42      0.48       706
           1       0.76      0.86      0.81      2559
           2       0.77      0.75      0.76      1092

    accuracy                           0.74      4746
   macro avg       0.70      0.61      0.64      4746
weighted avg       0.73      0.74      0.73      4746



# **Getting our test set ready**


In [12]:
x_test = test['message']
test_vect = vectorizer.transform(x_test)

# **Making predictions on the test set and adding a sentiment column to our original test dataframe**

In [13]:
y_pred = lisvc.predict(test_vect)

In [14]:
test['sentiment'] = y_pred

In [15]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,0
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


# **Creating an output csv for submission**

In [16]:
test[['tweetid','sentiment']].to_csv('test_submission.csv', index=False)