In [1]:
import pandas as pd
import numpy as np
import spacy
nlp = spacy.load("en_core_web_sm")

# remove misformatted row due to bad html
raw_df = (
    pd.read_csv('data/raw/quakenu_raw.csv')
        .drop(16803, axis=0)
)

# Use 3 decimal places in output display
pd.set_option("display.precision", 3)

# Set max rows displayed in output to 25
pd.set_option("display.max_rows", 25)
# print(raw_df[:1])
# print(raw_df.dtypes)

In [2]:
import datetime

cleaning_test_df = raw_df[::1000] # every 1000th post

def clean_post_df(raw_df):
    clean_df = raw_df
    clean_df['date'] = pd.to_datetime(clean_df['date'], format='%Y-%m-%d, %H:%M')
    return clean_df

clean_test_df = clean_post_df(cleaning_test_df)
clean_df = clean_post_df(raw_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['date'] = pd.to_datetime(clean_df['date'], format='%Y-%m-%d, %H:%M')


In [3]:
import re
raw_text_col = raw_df['post_text'].str.replace('[^\w\']', ' ', flags=re.UNICODE)

  raw_text_col = raw_df['post_text'].str.replace('[^\w\']', ' ', flags=re.UNICODE)


In [85]:
# print(raw_df.columns)
OLD_MEMBER_CUTOFF = 50

forum_first_post_dt = raw_df['date'].min()
forum_time_range = raw_df['date'].max() - forum_first_post_dt
# print(forum_time_range)

author_groups = raw_df.groupby('author')
last_post_dt = author_groups['date'].max()
first_post_dt = author_groups['date'].min()
post_time_range = last_post_dt - first_post_dt
last_post_era = (last_post_dt - forum_first_post_dt) / forum_time_range

# print(last_post_era)

post_frequency_seconds = post_time_range.dt.total_seconds() / author_groups['date'].count()
is_old_member = author_groups['date'].count() > OLD_MEMBER_CUTOFF

median_post_length_chars = raw_df['post_text'].str.len().dropna().groupby(raw_df['author']).median()
# mean_post_length_chars = raw_df['post_text'].str.len().dropna().groupby(raw_df['author']).mean()

# print(sum(is_old_member))
# print(raw_df[raw_df['post_text'].isna()])

In [89]:
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix 

# X = pd.concat([post_frequency_seconds, last_post_era], axis=1, keys=['post_freq', 'post_era'])
X = pd.concat([post_frequency_seconds, last_post_era, median_post_length_chars], axis=1, keys=['post_freq', 'post_era', 'post_len'])
X['post_len'].fillna(X['post_len'].median(), inplace=True)
y = is_old_member

np.random.seed(3342)
X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
classifier = LogisticRegression()

old_member_pipeline = Pipeline([
    ('scaler', scaler),
    ('classifier', classifier)
])

In [90]:
old_member_pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', LogisticRegression())])

In [91]:
y_hat_test = old_member_pipeline.predict(X_test)

print(classification_report(y_test, y_hat_test))

y_hat_train = old_member_pipeline.predict(X_train)

print(classification_report(y_train, y_hat_train, digits=4))

              precision    recall  f1-score   support

       False       0.90      0.99      0.94       470
        True       0.42      0.09      0.14        57

    accuracy                           0.89       527
   macro avg       0.66      0.54      0.54       527
weighted avg       0.85      0.89      0.85       527

              precision    recall  f1-score   support

       False     0.8922    0.9833    0.9355      1380
        True     0.5965    0.1717    0.2667       198

    accuracy                         0.8815      1578
   macro avg     0.7443    0.5775    0.6011      1578
weighted avg     0.8551    0.8815    0.8516      1578



In [40]:
print(raw_df['post_text'].str.len().groupby(raw_df['author'], sort=False).mean())

author
3       270.997
122     106.881
6       154.400
1       305.435
47      184.000
         ...   
2519    347.333
3415    157.000
3533    414.000
3395     67.000
2533    835.500
Name: post_text, Length: 2105, dtype: float64


In [6]:
"""
Text gathering/cleaning issues
    - scraper not catching underlines, italics, etc https://www.quakeworld.nu/forum/topic/28
    - bb-code divs https://www.quakeworld.nu/forum/topic/28
    - 
"""

pd.set_option('display.max_colwidth', None)
"""
Cleaning a single string for NLP
    
    - remove nonchars except for apostrophes

"""

print(raw_df[['thread', 'post_id', 'post_text']].sample(n=5))
# print(raw_text_col.sample(n=5))
# print([text for text in raw_text_col.sample(n=15)])

       thread post_id  \
11289    2686   35508   
12773    1044   13619   
38627    4840   58876   
26371    2159   26602   
40673    4509   54829   

                                                                                                                                                         post_text  
11289                                                                                                                                                       fixed   
12773                                                                                                    still looking for clan? contact us at: contact@clansq.org  
38627  hehe, so it seems like everything boils down to peoples lack of knowledge and understanding. (which was my point when i quoted hooraytion some posts above)  
26371                                                                                       yeah something wrong with the pl's hopefully we can solve that in time  
40673                   

In [7]:
import unidecode

test_accent_str = "Sí señor, ¿qué necesita?"

print(unidecode.unidecode(test_accent_str))

test_lemma_str = "I have question. joe many liberals does it take to change a log by bolb? none. their too bussy. they're gender?"

test_lemma_doc = nlp(test_lemma_str)

# for token in test_lemma_doc:
#     print(token.is_sent_start) 

print(' '.join([token.lemma_ for token in test_lemma_doc]))
# print(test_lemma_doc.cats)

Si senor, ?que necesita?
I have question . joe many liberal do it take to change a log by bolb ? none . their too bussy . they be gender ?
