In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# 1 Load and clean data

In [125]:
# Load the sentence pairs (tab-separated, no header row, four columns).
# Line 801 was deleted by hand from both input files: it had no score and caused
# an error while loading, so it is not usable for training.
# NOTE(review): read_csv treats '"' as a quote character by default. If the texts
# contain literal double quotes, quoting=csv.QUOTE_NONE may be required, and this
# may be what actually broke line 801 -- verify against the raw file.
column_names = ['Text String 1', 'Text String 2', 'Metadata 1', 'Metadata 2']
df = pd.read_csv(
    'sts2016-english-with-gs-v1.0/STS2016.input.answer-answer.txt',
    sep='\t',
    header=None,
    names=column_names,
)

# Load the gold-standard scores, one line per sentence pair.
# The file is read line by line (instead of via read_csv) so that blank lines are
# kept as NaN rows and every score stays at the same row position as its pair.
# The encoding is given explicitly so the result does not depend on the platform locale.
with open('sts2016-english-with-gs-v1.0/STS2016.gs.answer-answer.txt', 'r', encoding='utf-8') as file:
    # An empty (or whitespace-only) line becomes NaN; anything else is kept as a stripped string.
    data = [[line.strip() or np.nan] for line in file]

gs = pd.DataFrame(data, columns=['score'])

# The two frames are later joined purely by row position, so a differing row count
# would silently misalign or pad the scores. Fail fast here instead.
assert len(df) == len(gs), (
    f'row count mismatch: {len(df)} sentence pairs vs {len(gs)} score lines'
)

In [126]:
# Print the schema summary (row count, non-null counts, dtypes) of both raw frames.
for frame in (df, gs):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571 entries, 0 to 1570
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Text String 1  1571 non-null   object
 1   Text String 2  1571 non-null   object
 2   Metadata 1     1571 non-null   object
 3   Metadata 2     1571 non-null   object
dtypes: object(4)
memory usage: 49.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571 entries, 0 to 1570
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   score   254 non-null    object
dtypes: object(1)
memory usage: 12.4+ KB


In [127]:
# Merge the scores into the sentence-pair frame.
# concat (rather than merge) is used because the two frames share no key column;
# the rows are matched by index, relying on both files listing the pairs in the same order.
# Any 'score' column left over from a previous run of this cell is dropped first,
# so re-running the cell does not append a duplicate 'score' column.
df = pd.concat([df.drop(columns='score', errors='ignore'), gs], axis=1)
df

Unnamed: 0,Text String 1,Text String 2,Metadata 1,Metadata 2,score
0,Tasting it is the only reliable way.,The way you have it is fine.,StackExchange Network: http://cooking.stackexc...,StackExchange Network: http://writers.stackexc...,
1,I think it probably depends on your money.,It depends on your country.,StackExchange Network: http://workplace.stacke...,StackExchange Network: http://travel.stackexch...,
2,You need to read a lot to know what you like a...,You don't have to know.,StackExchange Network: http://writers.stackexc...,StackExchange Network: http://academia.stackex...,
3,"Obviously, the best book for you depends a lot...",The answer will depend of course on what you'r...,StackExchange Network: http://travel.stackexch...,StackExchange Network: http://travel.stackexch...,
4,I've had this same problem.,I had the same problem as you.,StackExchange Network: http://diy.stackexchang...,StackExchange Network: http://cooking.stackexc...,
...,...,...,...,...,...
1566,You are on the right path.,You are right about the overtaking rules.,StackExchange Network: http://money.stackexcha...,StackExchange Network: http://travel.stackexch...,
1567,This is a terrible idea.,This is a bad idea.,StackExchange Network: http://fitness.stackexc...,StackExchange Network: http://cooking.stackexc...,5
1568,I have the same thing.,I have had the same issue last couple of years.,StackExchange Network: http://diy.stackexchang...,StackExchange Network: http://pets.stackexchan...,
1569,I don't think it makes any tremendous difference.,I don't think that there's any.,StackExchange Network: http://cooking.stackexc...,StackExchange Network: http://academia.stackex...,


In [128]:
# clean data
# 1 rename all columns to snake_case
# Surrounding whitespace is stripped BEFORE spaces are replaced; stripping afterwards
# can never match, because every space has already become an underscore.
# Renaming first also makes this cell safe to re-run: the steps below only read the
# snake_case names, which exist both on the first run and on every later run.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)

# 2 convert score to float
df['score'] = pd.to_numeric(df['score'], errors='coerce') # coerce -> unparsable values become NaN

# 3 split each metadata string into a source label and a source URL
# Pattern (made by ChatGPT): group 1 is the text before ': ', group 2 is the
# http(s) URL that follows, up to the next whitespace.
pattern = r'^(.*): (\bhttps?://\S+)'

# str.extract returns one column per capture group; rows that do not match get NaN in both.
df[['source_1', 'source_1_url']] = df['metadata_1'].str.extract(pattern)
df[['source_2', 'source_2_url']] = df['metadata_2'].str.extract(pattern)

In [129]:
# Check the cleaned schema, then preview the first five rows as the cell output.
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1571 entries, 0 to 1570
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   text_string_1  1571 non-null   object 
 1   text_string_2  1571 non-null   object 
 2   metadata_1     1571 non-null   object 
 3   metadata_2     1571 non-null   object 
 4   score          254 non-null    float64
 5   source_1       1571 non-null   object 
 6   source_1_url   1571 non-null   object 
 7   source_2       1571 non-null   object 
 8   source_2_url   1571 non-null   object 
dtypes: float64(1), object(8)
memory usage: 110.6+ KB


Unnamed: 0,text_string_1,text_string_2,metadata_1,metadata_2,score,source_1,source_1_url,source_2,source_2_url
0,Tasting it is the only reliable way.,The way you have it is fine.,StackExchange Network: http://cooking.stackexc...,StackExchange Network: http://writers.stackexc...,,StackExchange Network,http://cooking.stackexchange.com/questions/5611,StackExchange Network,http://writers.stackexchange.com/questions/11017
1,I think it probably depends on your money.,It depends on your country.,StackExchange Network: http://workplace.stacke...,StackExchange Network: http://travel.stackexch...,,StackExchange Network,http://workplace.stackexchange.com/questions/1755,StackExchange Network,http://travel.stackexchange.com/questions/45030
2,You need to read a lot to know what you like a...,You don't have to know.,StackExchange Network: http://writers.stackexc...,StackExchange Network: http://academia.stackex...,,StackExchange Network,http://writers.stackexchange.com/questions/12166,StackExchange Network,http://academia.stackexchange.com/questions/26241
3,"Obviously, the best book for you depends a lot...",The answer will depend of course on what you'r...,StackExchange Network: http://travel.stackexch...,StackExchange Network: http://travel.stackexch...,,StackExchange Network,http://travel.stackexchange.com/questions/4582,StackExchange Network,http://travel.stackexchange.com/questions/23436
4,I've had this same problem.,I had the same problem as you.,StackExchange Network: http://diy.stackexchang...,StackExchange Network: http://cooking.stackexc...,,StackExchange Network,http://diy.stackexchange.com/questions/2138,StackExchange Network,http://cooking.stackexchange.com/questions/12500
