# Версия 1 (0.77). Базовые модели

In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.simplefilter('ignore')

In [2]:
# Read the tab-separated training file without a header row, so its columns
# are addressed by position (0 and 1) in the following cells.
df = pd.read_csv(
    'data/products_sentiment_train.tsv',
    delimiter='\t',
    header=None,
)
df.head()

Unnamed: 0,0,1
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [4]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import sent_tokenize


# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Filled on first use so the stop-word corpus is read once, not once per row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def text_preprocessing(text):
    """Normalise one raw review into a space-separated string of content words.

    Steps, applied per token in this order: tokenize with ``word_tokenize``,
    delete punctuation characters, lower-case, keep only purely alphabetic
    tokens, and drop English stop words.

    Args:
        text: Raw review text.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    stop_words = _english_stop_words()
    cleaned = (
        token.translate(_PUNCTUATION_TABLE).lower()
        for token in word_tokenize(text)
    )
    # isalpha() also rejects tokens that became empty after punctuation removal.
    return ' '.join(
        token for token in cleaned
        if token.isalpha() and token not in stop_words
    )

In [5]:
# Replace each raw review in column 0 with its cleaned form.
df[0] = df[0].map(text_preprocessing)

In [6]:
# Show the first rows of the frame to inspect the result of the previous cell.
df.head(n=5)

Unnamed: 0,0,1
0,take around pictures,1
1,downloaded trial version computer associates e...,1
2,plus perfect solution need wireless coverage w...,1
3,dont especially like music files unstructured ...,0
4,using cheapie pail worked ok opening device fe...,1


In [7]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

tf_idf = TfidfVectorizer()
lr = LogisticRegression()
pipe = Pipeline(steps=[('tf_idf', tf_idf), ('lr', lr)])

# Vectorizer settings explored for both analyzers.
shared_grid = {
    'tf_idf__ngram_range': [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (2, 5), (3, 5), (2, 6), (3, 6)],
    'tf_idf__min_df': [0.1, 0.2, 0.3],
    'tf_idf__max_df': [0.8, 0.9, 1.0],
}

# TfidfVectorizer applies stop_words only when analyzer='word', so it is varied
# for the word analyzer alone. Crossing it with 'char_wb' would just refit
# identical models; the effective search space is unchanged.
# The logistic-regression C is not searched and stays at its default.
param_grid = [
    dict(shared_grid, tf_idf__analyzer=['word'], tf_idf__stop_words=['english', None]),
    dict(shared_grid, tf_idf__analyzer=['char_wb']),
]

search = GridSearchCV(pipe, param_grid, n_jobs=-1, verbose=2)
search.fit(df[0], df[1])
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameter (CV score=0.703):
{'tf_idf__analyzer': 'char_wb', 'tf_idf__max_df': 0.8, 'tf_idf__min_df': 0.1, 'tf_idf__ngram_range': (2, 3)}


In [8]:
# Character n-gram TF-IDF + logistic regression, fitted on the full training set.
# NOTE(review): ngram_range=(2, 4) differs from the (2, 3) reported by the grid
# search output, and min_df/max_df are left at their defaults — confirm this
# choice is intentional.
final_pipe = make_pipeline(
    TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(2, 4),
        # stop_words is not passed: it only applies when analyzer='word'.
    ),
    LogisticRegression(),
)
final_pipe.fit(df[0], df[1])

df_test = pd.read_csv('data/products_sentiment_test.tsv', sep='\t')
# The model was fitted on cleaned reviews, so the test reviews must go through
# the same cleaning; otherwise train and test features are built differently.
test_texts = df_test['text'].apply(text_preprocessing)
prediction = final_pipe.predict(test_texts)


In [9]:
# Load the sample submission; its 'y' column is overwritten with predictions
# in the next cell.
df_sample = pd.read_csv(
    'data/products_sentiment_sample_submission.csv',
)
df_sample.head()

Unnamed: 0,Id,y
0,0,0
1,1,1
2,2,0
3,3,1
4,4,0


In [6]:
# Replace the template's 'y' column with the model's predictions.
df_sample = df_sample.assign(y=prediction)
df_sample.head()

Unnamed: 0,Id,y
0,0,1
1,1,0
2,2,1
3,3,1
4,4,0


In [7]:
# Save the submission file without writing the DataFrame index.
df_sample.to_csv('prediction.csv', index=False)

# Версия 2 (0.86). DistilBERT
(лучше запускать в Google Colab)

In [None]:
! pip install transformers datasets

In [None]:
import pandas as pd
# Read the training TSV without a header row and name its two positional columns.
df = (
    pd.read_csv('products_sentiment_train.tsv', sep='\t', header=None)
    .rename(columns={0: 'text', 1: 'label'})
)
df.head()

In [None]:
# Load the tab-separated test reviews; the 'text' column is used further down.
df_test = pd.read_csv('products_sentiment_test.tsv', delimiter='\t')
df_test.head()

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import sent_tokenize

import nltk
# Fetch the NLTK data used by the helpers in this cell.
# Recent NLTK releases make word_tokenize load 'punkt_tab' instead of 'punkt',
# so both are requested to keep the cell working across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def text_preprocessing(text):
    """Tokenize ``text``, strip punctuation, lower-case, keep alphabetic tokens.

    Stop words are not removed in this version. The remaining tokens are
    returned joined by single spaces.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    kept = []
    for raw in word_tokenize(text):
        token = raw.translate(strip_punct).lower()
        # Drops numbers, mixed tokens, and tokens emptied by punctuation removal.
        if token.isalpha():
            kept.append(token)
    return ' '.join(kept)

# Clean every training review before it is passed to the tokenizer.
df['text'] = [text_preprocessing(review) for review in df['text']]

In [None]:
from sklearn.model_selection import train_test_split
# Split into 60% train / 40% validation, stratified on the label, with a fixed
# random_state so the split is repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.4,
    random_state=0,
    stratify=df['label'],
)

In [None]:
from transformers import DistilBertTokenizerFast

# Tokenizer for the same 'distilbert-base-uncased' checkpoint that is
# fine-tuned later in the notebook.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased'
)

In [None]:
# Tokenize the train and validation texts with truncation and padding enabled.
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts)
)

In [None]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence. It must
            contain an "input_ids" entry, whose length is the dataset size.
        labels: Per-example labels indexable by position, or None (the default)
            when there are no labels, e.g. for inference.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Return True when a non-empty label container was supplied."""
        # Explicit None/len checks instead of truth-testing the container:
        # `if labels:` raises for multi-element NumPy arrays and tensors.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus "labels" if present."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" feature."""
        return len(self.encodings["input_ids"])
    
# Pair each split's encodings with its labels.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
val_dataset = CustomDataset(encodings=val_encodings, labels=val_labels)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np

def compute_metrics(p):
    """Score a (predictions, labels) pair with accuracy, precision, recall and F1.

    ``p`` is unpacked into raw per-class scores and the true labels; the
    predicted class is the argmax of the scores along axis 1.

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1".
    """
    scores, y_true = p
    y_pred = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=y_true, y_pred=y_pred)
        for name, scorer in scorers.items()
    }

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback

from transformers import set_seed

# NOTE(review): evaluation happens only every 500 steps and warm-up also lasts
# 500 steps. With 3 epochs at batch size 16, confirm the run is long enough to
# reach an evaluation step; otherwise early stopping and best-model selection
# never trigger.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    seed=0,
    load_best_model_at_end=True,
)

# Seed the RNGs before the model is created. The sequence-classification head
# has no pretrained weights and is initialised randomly, so seeding only via
# TrainingArguments (applied later, by the Trainer) leaves that initialisation
# different on every run.
set_seed(training_args.seed)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=20)],
)

trainer.train()

In [None]:
# Reload the test set and clean it with the same function used for training text.
df_test = pd.read_csv('products_sentiment_test.tsv', sep='\t')
df_test['text'] = [text_preprocessing(review) for review in df_test['text']]
test_texts = df_test['text'].tolist()

# The test split is wrapped without labels (labels=None).
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = CustomDataset(test_encodings, None)


In [None]:
# Take the raw per-class scores from the prediction output and pick the
# highest-scoring class for each test review.
raw_pred = trainer.predict(test_dataset).predictions

prediction = np.argmax(raw_pred, axis=1)

In [None]:
# Fill the sample submission's 'y' column with the predictions and save it
# without the DataFrame index.
df_sample = pd.read_csv('products_sentiment_sample_submission.csv').assign(y=prediction)
df_sample.to_csv('prediction.csv', index=False)