In [1]:
import pandas as pd
from typing import Iterable 


In [2]:
# Column names of the tweet DataFrame used throughout this notebook.
TWEET = 'tweet'  # raw tweet text column
TWEET_CLEAN = 'clean_tweet'  # text after the TextTransformer pass
TWEET_TOKENIZE = 'tweet_token'  # word_tokenize output for the cleaned text
TWEET_FILTERED = 'tweet_token_filtered'  # tokens with English stop words removed
TWEET_STEMMED = 'tweet_stemmed'  # PorterStemmer output for the filtered tokens
TWEET_LEMMATIZED = 'tweet_lemmatized'  # WordNetLemmatizer output for the filtered tokens

In [3]:
# Load the train and test tweet CSVs and stack them into a single frame.
df_train = pd.read_csv("hw1/content/train_tweets.csv")
df_test = pd.read_csv("hw1/content/test_tweets.csv")
# ignore_index=True renumbers the rows 0..n-1. Without it each frame keeps its
# own row labels, so the combined index would contain duplicate labels and
# label-based lookups such as df.loc[0] would return more than one row.
df = pd.concat([df_train, df_test], ignore_index=True)
df.head(10)

Unnamed: 0,id,label,tweet
0,1,0.0,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation
5,6,0.0,[2/2] huge fan fare and big talking before the...
6,7,0.0,@user camping tomorrow @user @user @user @use...
7,8,0.0,the next school year is the year for exams.ð...
8,9,0.0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0.0,@user @user welcome here ! i'm it's so #gr...


In [4]:
from hw1.texteditor import TextTransformer, TextEditor

editor = TextEditor()

# Cleaning steps in the order they are handed to TextTransformer
# (presumably applied in list order — confirm in hw1.texteditor).
cleaning_steps = [
    editor.convert_html_symbol,           # replace HTML entities
    editor.remove_at_word,                # remove @user from all tweets
    editor.to_lower,                      # convert tweets to lower case
    editor.change_apostrophe_word,        # replace contractions written with apostrophes
    editor.change_abbreviation_to_words,  # replace abbreviations with their full forms
    editor.change_emoticon_to_word,       # replace emoticons
    editor.remove_punctuation,            # replace punctuation with spaces
    editor.remove_spec_symbol,            # replace special characters with spaces
    editor.remove_numbers,                # replace numbers with spaces
    editor.remove_word_with_len_one,      # remove one-character words from the text
]

transfomer = TextTransformer(cleaning_steps)

In [5]:
# Clean every raw tweet with the transformer and store the result in a new column.
df[TWEET_CLEAN] = [transfomer.transform(raw_text) for raw_text in df[TWEET]]
df.head(5)

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...
2,3,0.0,bihday your majesty,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation


In [6]:
import nltk


In [7]:
# Split tweets into tokens
from nltk.tokenize import word_tokenize

# Download the 'punkt' tokenizer data only when tokenizing fails for lack of
# it, instead of relying on a hand-toggled `if False:` switch that leaves a
# fresh environment without the resource.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead — confirm
# against the installed version.
try:
    word_tokenize("probe")
except LookupError:
    nltk.download('punkt')

# Tokenize each cleaned tweet and store the token lists in a new column.
df[TWEET_TOKENIZE] = [word_tokenize(clean_text) for clean_text in df[TWEET_CLEAN]]
df.head(5)

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ..."
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]"


In [8]:
# Remove stop words from the tokens
from nltk.corpus import stopwords

# Build the English stop-word set. If the corpus is not installed yet, fetch
# it once and retry, instead of relying on a hand-toggled `if False:` switch.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words("english"))

def remove_stopword(words: Iterable[str]) -> list[str]:
    """Return the tokens from ``words`` that are not stop words.

    Membership is checked against the module-level ``stop_words`` collection.
    The order and any duplicates of the remaining tokens are preserved.

    Args:
        words: Tokens to filter.

    Returns:
        A new list with every token found in ``stop_words`` dropped.
    """
    return [word for word in words if word not in stop_words]

# Drop stop words from every token list and store the result in a new column.
df[TWEET_FILTERED] = [remove_stopword(tokens) for tokens in df[TWEET_TOKENIZE]]
df.head()


Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]"


In [9]:
# Apply stemming to the tokens
from nltk.stem import PorterStemmer

# Single stemmer instance shared by stem() below.
stemmer = PorterStemmer()

def stem(words:Iterable[str])->list[str]:
    """Stem every token with the shared ``stemmer`` and return the results in order."""
    stemmed_tokens = []
    for token in words:
        stemmed_tokens.append(stemmer.stem(token))
    return stemmed_tokens

# Stem every filtered token list and store the result in a new column.
df[TWEET_STEMMED] = [stem(tokens) for tokens in df[TWEET_FILTERED]]
df.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]"


In [12]:
# Apply lemmatization to the tokens
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Make sure the WordNet corpus can be loaded; fetch it once when it is
# missing, instead of relying on a hand-toggled `if False:` switch.
try:
    wordnet.ensure_loaded()
except LookupError:
    nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize(words:Iterable[str])->list[str]:
    """Lemmatize every token as a verb with the shared ``lemmatizer``, keeping order."""
    lemmas = []
    for token in words:
        # Every token is looked up with the verb part-of-speech tag.
        lemmas.append(lemmatizer.lemmatize(token, wordnet.VERB))
    return lemmas

# Lemmatize every filtered token list and store the result in a new column.
df[TWEET_LEMMATIZED] = [lemmatize(tokens) for tokens in df[TWEET_FILTERED]]
df.head()

[nltk_data] Downloading package wordnet to C:\Users\Alexander
[nltk_data]     Solomennik\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, cause, offer, wheel..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


In [13]:
# Save the preprocessed frame. A forward slash is used because '\d' in a
# plain string literal is an invalid escape sequence, and a backslash is a
# path separator only on Windows.
df.to_pickle('hw1/df_preprocessing.pkl')
