In [1]:
import pandas as pd
import math 
from sklearn.model_selection import train_test_split

# Read the CSV data into a pandas dataframe
df_2020 = pd.read_csv('files/lesswrong_posts_2020.csv').sort_values(by='finalReviewVoteScoreHighKarma', ascending=False)
df_2021 = pd.read_csv('files/lesswrong_posts_2021.csv').sort_values(by='finalReviewVoteScoreHighKarma', ascending=False)
df_2022 = pd.read_csv('files/lesswrong_posts_2022.csv').sort_values(by='finalReviewVoteScoreHighKarma', ascending=False)

threshold_2020 = df_2020.iloc[49].finalReviewVoteScoreHighKarma
threshold_2021 = df_2021.iloc[49].finalReviewVoteScoreHighKarma
threshold_2022 = df_2022.iloc[49].finalReviewVoteScoreHighKarma

df_2020['winner'] = df_2020.finalReviewVoteScoreHighKarma > threshold_2020
df_2021['winner'] = df_2021.finalReviewVoteScoreHighKarma > threshold_2021
df_2022['winner'] = df_2022.finalReviewVoteScoreHighKarma > threshold_2022

In [2]:
data = pd.concat([df_2020, df_2021, df_2022])

In [3]:
# Define your features and target variable
X = data[['baseScore', 'voteCount', 'commentCount', 'year']]
y = data['winner']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the logistic regression model
logreg = LogisticRegression()

# Train the model
logreg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.987746170678337
              precision    recall  f1-score   support

       False       0.99      1.00      0.99      2254
        True       0.60      0.29      0.39        31

    accuracy                           0.99      2285
   macro avg       0.80      0.64      0.69      2285
weighted avg       0.99      0.99      0.99      2285



In [5]:
data_2023 = pd.read_csv('files/lesswrong_posts_2023.csv')

In [6]:
# Assuming you have a DataFrame for 2023 posts
X_2023 = data_2023[['baseScore', 'voteCount', 'commentCount', 'year']]

# Predict probabilities
probabilities = logreg.predict_proba(X_2023)[:, 1]  # Get the probability of being in the top fifty

# # You can set a threshold to decide whether a post is likely to be in the top fifty
# threshold = 0.5  # This is an example, you may need to adjust it
# predictions = (probabilities > threshold).astype(int)


In [7]:
probabilities

array([0.00207386, 0.00400569, 0.00167052, ..., 0.00193948, 0.00053111,
       0.00180963])

In [8]:
data_2023['probability'] = probabilities

In [9]:
data_2023.sort_values(by="probability",ascending=False).head(50)

Unnamed: 0,postId,title,slug,pageUrl,postedAt,baseScore,voteCount,commentCount,finalReviewVoteScoreHighKarma,year,probability
1416,yA8DWsHJeFZhDcQuo,The Talk: a brief explanation of sexual dimorp...,the-talk-a-brief-explanation-of-sexual-dimorphism,https://www.lesswrong.com/posts/yA8DWsHJeFZhDc...,2023-09-18T16:23:56.073Z,472,217,71,0.0,2023,0.997633
3425,nnDTgmzRrzDMiPF9B,How much do you believe your results?,how-much-do-you-believe-your-results,https://www.lesswrong.com/posts/nnDTgmzRrzDMiP...,2023-05-06T20:31:31.277Z,439,198,13,0.0,2023,0.994658
3296,5spBue2z2tw4JuDCx,Steering GPT-2-XL by adding an activation vector,steering-gpt-2-xl-by-adding-an-activation-vector,https://www.lesswrong.com/posts/5spBue2z2tw4Ju...,2023-05-13T18:42:41.321Z,409,187,96,0.0,2023,0.987767
2553,kAmgdEjq2eYQkB5PP,Douglas Hofstadter changes his mind on Deep Le...,douglas-hofstadter-changes-his-mind-on-deep-le...,https://www.lesswrong.com/posts/kAmgdEjq2eYQkB...,2023-07-03T00:48:47.131Z,409,198,53,0.0,2023,0.981184
213,JEhW3HDMKzekDShva,Significantly Enhancing Adult Intelligence Wit...,significantly-enhancing-adult-intelligence-wit...,https://www.lesswrong.com/posts/JEhW3HDMKzekDS...,2023-12-12T18:14:51.438Z,382,170,159,0.0,2023,0.979828
3047,HcJPJxkyCsrpSdCii,Statement on AI Extinction - Signed by AGI Lab...,statement-on-ai-extinction-signed-by-agi-labs-...,https://www.lesswrong.com/posts/HcJPJxkyCsrpSd...,2023-05-30T09:05:25.986Z,372,161,73,0.0,2023,0.975519
4782,D7PumeYTDPfBTp3i7,The Waluigi Effect (mega-post),the-waluigi-effect-mega-post,https://www.lesswrong.com/posts/D7PumeYTDPfBTp...,2023-03-03T03:22:08.619Z,605,467,186,0.0,2023,0.952118
4572,psYNRb3JCncQBjd4v,Shutting Down the Lightcone Offices,shutting-down-the-lightcone-offices,https://www.lesswrong.com/posts/psYNRb3JCncQBj...,2023-03-14T22:47:51.539Z,337,144,93,0.0,2023,0.941858
2979,ejxwraMP5ye7Bgmpm,Things I Learned by Spending Five Thousand Hou...,things-i-learned-by-spending-five-thousand-hou...,https://www.lesswrong.com/posts/ejxwraMP5ye7Bg...,2023-06-01T20:48:03.940Z,355,171,32,0.0,2023,0.927056
3411,yT22RcWrxZcXyGjsA,How to have Polygenically Screened Children,how-to-have-polygenically-screened-children,https://www.lesswrong.com/posts/yT22RcWrxZcXyG...,2023-05-07T16:01:07.096Z,328,140,98,0.0,2023,0.9269


In [12]:
data_2023['probability']

0       0.002074
1       0.004006
2       0.001671
3       0.001826
4       0.000601
          ...   
4993    0.002706
4994    0.033977
4995    0.001939
4996    0.000531
4997    0.001810
Name: probability, Length: 4998, dtype: float64

In [14]:
# doing some regression to the mean
# use geometric mean of odds
base_rate = 0.14
data_2023['geo_mean_probability'] = data_2023['probability'].apply(lambda x: math.sqrt(x * base_rate))
# TODO : add curation status

In [16]:
data_2023.sort_values(by="probability",ascending=False).head(50)

Unnamed: 0,postId,title,slug,pageUrl,postedAt,baseScore,voteCount,commentCount,finalReviewVoteScoreHighKarma,year,probability,geo_mean_probability
1416,yA8DWsHJeFZhDcQuo,The Talk: a brief explanation of sexual dimorp...,the-talk-a-brief-explanation-of-sexual-dimorphism,https://www.lesswrong.com/posts/yA8DWsHJeFZhDc...,2023-09-18T16:23:56.073Z,472,217,71,0.0,2023,0.997633,0.373723
3425,nnDTgmzRrzDMiPF9B,How much do you believe your results?,how-much-do-you-believe-your-results,https://www.lesswrong.com/posts/nnDTgmzRrzDMiP...,2023-05-06T20:31:31.277Z,439,198,13,0.0,2023,0.994658,0.373165
3296,5spBue2z2tw4JuDCx,Steering GPT-2-XL by adding an activation vector,steering-gpt-2-xl-by-adding-an-activation-vector,https://www.lesswrong.com/posts/5spBue2z2tw4Ju...,2023-05-13T18:42:41.321Z,409,187,96,0.0,2023,0.987767,0.37187
2553,kAmgdEjq2eYQkB5PP,Douglas Hofstadter changes his mind on Deep Le...,douglas-hofstadter-changes-his-mind-on-deep-le...,https://www.lesswrong.com/posts/kAmgdEjq2eYQkB...,2023-07-03T00:48:47.131Z,409,198,53,0.0,2023,0.981184,0.370629
213,JEhW3HDMKzekDShva,Significantly Enhancing Adult Intelligence Wit...,significantly-enhancing-adult-intelligence-wit...,https://www.lesswrong.com/posts/JEhW3HDMKzekDS...,2023-12-12T18:14:51.438Z,382,170,159,0.0,2023,0.979828,0.370373
3047,HcJPJxkyCsrpSdCii,Statement on AI Extinction - Signed by AGI Lab...,statement-on-ai-extinction-signed-by-agi-labs-...,https://www.lesswrong.com/posts/HcJPJxkyCsrpSd...,2023-05-30T09:05:25.986Z,372,161,73,0.0,2023,0.975519,0.369557
4782,D7PumeYTDPfBTp3i7,The Waluigi Effect (mega-post),the-waluigi-effect-mega-post,https://www.lesswrong.com/posts/D7PumeYTDPfBTp...,2023-03-03T03:22:08.619Z,605,467,186,0.0,2023,0.952118,0.365098
4572,psYNRb3JCncQBjd4v,Shutting Down the Lightcone Offices,shutting-down-the-lightcone-offices,https://www.lesswrong.com/posts/psYNRb3JCncQBj...,2023-03-14T22:47:51.539Z,337,144,93,0.0,2023,0.941858,0.363125
2979,ejxwraMP5ye7Bgmpm,Things I Learned by Spending Five Thousand Hou...,things-i-learned-by-spending-five-thousand-hou...,https://www.lesswrong.com/posts/ejxwraMP5ye7Bg...,2023-06-01T20:48:03.940Z,355,171,32,0.0,2023,0.927056,0.360261
3411,yT22RcWrxZcXyGjsA,How to have Polygenically Screened Children,how-to-have-polygenically-screened-children,https://www.lesswrong.com/posts/yT22RcWrxZcXyG...,2023-05-07T16:01:07.096Z,328,140,98,0.0,2023,0.9269,0.36023


In [17]:
# Perform analysis on the dataframe
# For example:
# print(df.head())  # Display the first few rows of the dataframe
# ... (add your analysis code here)

# Save the notebook with the analysis
# For example:
data_2023.to_csv('data_2023.csv', index=False)