## Классификация полярности твитов 

Датасет: http://help.sentiment140.com/for-students/

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# 'ansi' is a Windows-only alias for the machine's locale code page: it raises
# LookupError on Linux/macOS and decodes non-ASCII bytes differently from one
# Windows locale to another. Latin-1 maps every byte, so the read is portable
# and deterministic (Sentiment140 is commonly read as Latin-1 — confirm if
# non-ASCII characters matter for your analysis).
df = pd.read_csv('data.csv', encoding='latin-1')

# Keep only the label and the tweet text.
df = df.drop(columns=['id', 'date', 'query', 'user'])
# Remap label 4 to 1 so the target takes the values 0 and 1.
df.polarity = df.polarity.replace(4,1)

In [3]:
# Preview the first five rows to check the columns that are left.
df.head(n=5)

Unnamed: 0,polarity,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Work on a copy: a plain `dataset = df` only creates a second name for the
# same DataFrame, so every cleaning step would also rewrite `df` and make the
# preview shown above stale.
dataset = df.copy()

In [5]:
# Lower-case every tweet.
dataset['text'] = dataset['text'].str.lower()

Уберем из твитов упоминания (которые начинаются на '@')

In [6]:
def clean_mentions(data):
    """Replace every @mention in a string with a single space.

    A mention is an '@' followed by one or more word characters.

    Args:
        data: The text to clean.

    Returns:
        The text with each mention substituted by ' '.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.sub(r'@\w+', ' ', data)
# Strip mentions from every tweet.
dataset['text'] = dataset['text'].apply(clean_mentions)

Уберем ссылки:

In [7]:
def clean_URLs(data):
    """Replace every URL in a string with a single space.

    A URL is either 'www.' or 'http://' / 'https://' followed by a run of
    non-whitespace characters.

    Args:
        data: The text to clean.

    Returns:
        The text with each URL substituted by ' '.
    """
    # The dot after 'www' is escaped: unescaped it matches any character, so
    # elongated words such as 'awwwww' were being deleted as if they were URLs.
    return re.sub(r'www\.\S+|https?://\S+', ' ', data)

# Strip URLs from every tweet.
dataset['text'] = dataset['text'].apply(clean_URLs)

Уберем знаки препинания:

In [8]:
# ASCII punctuation characters to delete (the same set as string.punctuation).
punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
# Deletion table built once up front; building it inside the function repeated
# identical work for every tweet.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations)


def clean_punctuations(data):
    """Delete every character listed in `punctuations` from a string.

    Characters are removed, not replaced, so text on either side of a
    punctuation mark is joined ("can't" becomes "cant").

    Args:
        data: The text to clean.

    Returns:
        The text without punctuation characters.
    """
    return data.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every tweet.
dataset['text'] = dataset['text'].apply(clean_punctuations)

Уберем цифры:

In [9]:
def clean_numbers(data):
    """Return `data` with every run of ASCII digits removed."""
    digit_run = '[0-9]+'
    without_digits = re.sub(digit_run, '', data)
    return without_digits
# Strip digits from every tweet.
dataset['text'] = dataset['text'].apply(clean_numbers)

Токенизируем, применим стемминг:

In [10]:
# Tokens are runs of word characters.
word_pattern = r'\w+'
tokenizer = RegexpTokenizer(word_pattern)
# From here on each `text` entry is the tokenizer's output, not a plain string.
dataset['text'] = dataset['text'].apply(tokenizer.tokenize)

In [11]:
stemmer = nltk.PorterStemmer()


def stem(data):
    """Return a new list holding the Porter stem of each token in `data`."""
    return list(map(stemmer.stem, data))

# Stem the tokens of every tweet.
dataset['text'] = dataset['text'].apply(stem)

Склеим токены обратно:

In [12]:
def join_back(data):
    """Concatenate the tokens in `data` into one space-separated string."""
    separator = ' '
    return separator.join(data)

# Turn each token list back into a single string for the vectoriser.
dataset['text'] = dataset['text'].apply(join_back)

Извлечем обучающую и тестовую выборки:

In [13]:
# Features are the cleaned tweet text; the target is the 0/1 polarity label.
X, y = dataset['text'], dataset['polarity']

In [14]:
# Hold out 10% for testing. The seed is fixed so the split — and every score
# computed from it — is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

Извлечем признаки с помощью мешка n-грамм (bag of n-grams: униграммы и биграммы) с последующим tf-idf-преобразованием.

Все это делается с помощью TfidfVectorizer

In [15]:
# Unigrams and bigrams, with the vocabulary capped at 500000 features.
vectoriser = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=500000,
)
# Fit on the training split only, so nothing from the test set leaks in.
vectoriser.fit(X_train)

TfidfVectorizer(max_features=500000, ngram_range=(1, 2))

In [16]:
# NOTE: this overwrites the raw-text splits with their vectorised form, so
# re-running this cell on its own feeds already-transformed data back in;
# re-run from the split cell instead.
X_train, X_test = vectoriser.transform(X_train), vectoriser.transform(X_test)

Обучим наивный байесовский классификатор:

In [17]:
# Bernoulli naive Bayes with default settings, trained on the TF-IDF features.
naive_bayes = BernoulliNB()
naive_bayes.fit(X=X_train, y=y_train)

BernoulliNB()

Результат на тестовой выборке:

In [18]:
# Score the naive Bayes model on the held-out split.
y_pred = naive_bayes.predict(X_test)

nb_report = classification_report(y_true=y_test, y_pred=y_pred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.81      0.78      0.79     79969
           1       0.79      0.81      0.80     80031

    accuracy                           0.80    160000
   macro avg       0.80      0.80      0.80    160000
weighted avg       0.80      0.80      0.80    160000



Обучим классификатор на основе логистической регрессии:

In [19]:
# NOTE: despite its name, `linear_regression` holds a LogisticRegression
# classifier; the name is kept because the evaluation cell refers to it.
linear_regression = LogisticRegression(max_iter=1000)
linear_regression.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000)

Результат на тестовой выборке:

In [20]:
# Score the logistic-regression model on the held-out split.
y_pred = linear_regression.predict(X_test)

lr_report = classification_report(y_true=y_test, y_pred=y_pred)
print(lr_report)

              precision    recall  f1-score   support

           0       0.83      0.81      0.82     79969
           1       0.82      0.83      0.82     80031

    accuracy                           0.82    160000
   macro avg       0.82      0.82      0.82    160000
weighted avg       0.82      0.82      0.82    160000

