**Guess the date of reddits (large edition)**

**Task**: Guess the date of a Reddit post based on its text. This is a larger version of the task, with more posts and more subreddits (topics).

*Adam Mickiewicz University*

*Faculty of Mathematics and Computer Science*

*Subject: Machine translation*

In [1]:
# Fetch the challenge repository into the working directory; later cells read
# its train, dev-0 and test-A sub-directories.
!git clone https://git.wmi.amu.edu.pl/dawjur/guess-reddit-date-sumo.git

Cloning into 'guess-reddit-date-sumo'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 20 (delta 2), reused 0 (delta 0)
Unpacking objects: 100% (20/20), done.


In [2]:
# Count the lines of the xz-compressed training input by streaming it through wc.
!xzcat "guess-reddit-date-sumo/train/in.tsv.xz" | wc -l

5000000


In [3]:
import sys
import numpy as np

from sklearn.linear_model import LinearRegression
import sklearn.metrics
import sklearn.decomposition
import sklearn.feature_extraction.text
from sklearn.feature_extraction.text import CountVectorizer
import csv
import datetime
import lzma
import pandas as pd
from sklearn.metrics import mean_squared_error

In [4]:
def read_file_to_list(path):
  """Return every line of the xz-compressed file at ``path`` as a list.

  The file is opened with ``lzma.open`` in its default (binary) mode, so each
  element is a ``bytes`` object that still carries its line terminator.
  Reading stops at the first empty read, i.e. at end of file.
  """
  with lzma.open(path) as compressed:
    # iter(callable, sentinel) keeps calling readline() until it returns b"".
    return list(iter(compressed.readline, b""))

In [5]:
def load_set(path, isTest):
  """Load one split of the guess-reddit-date-sumo dataset.

  Args:
    path: Name of the split directory inside ``guess-reddit-date-sumo/``.
    isTest: When truthy, only the inputs are loaded.

  Returns:
    A one-column (``text``) DataFrame built from ``in.tsv.xz`` when ``isTest``
    is truthy; otherwise a ``(dataset, expected)`` tuple where ``expected`` is
    a one-column (``year``) DataFrame read from ``expected.tsv.xz``.
  """
  split_dir = "guess-reddit-date-sumo/" + path
  lines = read_file_to_list(split_dir + "/in.tsv.xz")
  dataset = pd.DataFrame(lines, columns=["text"])
  if isTest:
    return dataset
  # The expected file has no header row, so the column name is supplied here.
  expected = pd.read_csv(split_dir + "/expected.tsv.xz", header=None, names=["year"])
  return dataset, expected

In [6]:
# Load the three splits. train and dev-0 are loaded together with their
# expected years; test-A is loaded as inputs only.
train_set, expected_train = load_set("train", False)
dev_set, expected_dev = load_set("dev-0", False)
test_set = load_set("test-A", True)

In [7]:
# Summarize the test frame: row count, columns, dtypes and memory usage.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    100000 non-null  object
dtypes: object(1)
memory usage: 781.4+ KB


In [8]:
# TF-IDF features over unigrams only, using scikit-learn's built-in English
# stop-word list. The fitted instance is reused later to transform dev/test.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
)

In [9]:
# Replace missing values with a placeholder string before vectorizing.
# Note: this rebinds train_set to the filled copy.
train_set = train_set.fillna("No text")

In [10]:
# Fit the TF-IDF vectorizer on the training texts and vectorize them in one pass.
train_data = vectorizer.fit_transform(train_set["text"])

In [11]:
# Reduce the TF-IDF matrix to 100 dense components with truncated SVD.
# The variable is called `pca` for continuity with later cells, but the
# estimator is TruncatedSVD.
# random_state is pinned because the default solver is randomized; without a
# seed the components (and every score computed from them) differ between runs.
pca = sklearn.decomposition.TruncatedSVD(n_components=100, random_state=42)
data = pca.fit_transform(train_data)
data

array([[ 0.01287501, -0.00295359, -0.00097173, ..., -0.00493291,
         0.00203225,  0.00675423],
       [ 0.11081329, -0.00849485,  0.0384299 , ..., -0.00331018,
         0.02321383, -0.00240181],
       [ 0.11272593, -0.01940163, -0.00712396, ..., -0.00138703,
         0.00560076, -0.00931112],
       ...,
       [ 0.16472329, -0.03708477, -0.03033352, ..., -0.045385  ,
         0.00520277,  0.02801315],
       [ 0.01377693, -0.00300024,  0.00029046, ...,  0.03991493,
        -0.02230031,  0.09500838],
       [ 0.0617862 , -0.01005282, -0.00280986, ..., -0.01976292,
        -0.02490221,  0.00413727]])

In [12]:
# Fit an ordinary least-squares model that maps the reduced features to the
# expected year. expected_train is a one-column DataFrame.
regression = LinearRegression()
regression.fit(data,expected_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [13]:
# Mean squared error on the training data. Arguments follow the documented
# (y_true, y_pred) order: ground truth first, predictions second.
mean_squared_error(expected_train, regression.predict(data))

1.8575883487072233

In [14]:
# Display the dev frame (pandas truncates the middle rows in the rendered output).
dev_set

Unnamed: 0,text
0,b'I love these.\n'
1,b'#39!\n'
2,b'Anything [here](https://www.reddit.com/r/gam...
3,"b""Source? Not being a dick, just actually wond..."
4,b'At least Ribery kind of has a reason to look...
...,...
99995,b'Why is it that in the US people are just pic...
99996,b'5429-7980-8121\n'
99997,"b""If I was a Bayern fan I'd be pumped that a p..."
99998,"b""Honestly I enjoyed it quite a lot. Despite m..."


In [15]:
def transform_data(raw_data):
  """Turn a frame with a ``text`` column into reduced feature vectors.

  Missing values are replaced with the placeholder "No text", the ``text``
  column is vectorized with the module-level ``vectorizer``, and the result is
  projected with the module-level ``pca``. Both objects are only applied here
  (``transform``), never refitted.
  """
  texts = raw_data.fillna("No text")["text"]
  return pca.transform(vectorizer.transform(texts))

In [16]:
# Apply the fitted feature pipeline to the dev split and predict its years.
dev_transformed = transform_data(dev_set)
predict_dev = regression.predict(dev_transformed)
predict_dev

array([[2013.7801284 ],
       [2013.69552127],
       [2013.34242613],
       ...,
       [2013.67977629],
       [2013.85753807],
       [2013.54962279]])

In [17]:
# Apply the fitted feature pipeline to the test split and predict its years.
test_transformed = transform_data(test_set)
predict_test = regression.predict(test_transformed)
predict_test

array([[2013.68361134],
       [2013.70100938],
       [2013.64563541],
       ...,
       [2013.60782988],
       [2013.65974392],
       [2013.68198109]])

In [18]:
# Mean squared error on the dev split. Arguments follow the documented
# (y_true, y_pred) order: ground truth first, predictions second.
mean_squared_error(expected_dev, predict_dev)

1.882502665661785

In [19]:
# Write one prediction per line, in fixed-point notation, next to each split's
# input file.
np.savetxt('guess-reddit-date-sumo/test-A/out.tsv', predict_test, fmt='%f')
np.savetxt('guess-reddit-date-sumo/dev-0/out.tsv', predict_dev, fmt='%f')