In [1]:
import json
import string
import re
import nltk
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from collections import Counter
from sklearn.metrics import r2_score

from nltk.corpus import stopwords
# English stop-word set for text cleaning. The previous bare
# ", ".join(...) expression built a string and threw it away, so it is gone.
Stopwords = set(stopwords.words('english'))

In [2]:
# Read both JSON dumps inside context managers so the file handles are
# closed as soon as parsing finishes instead of staying open for the session.
with open("C:/Users/user/Desktop/ML_Challenge/train-1.json") as train_set:
    train = json.load(train_set)
with open("C:/Users/user/Desktop/ML_Challenge/test.json") as test_set:
    test = json.load(test_set)

In [3]:
df_train = pd.DataFrame(train)
df_test = pd.DataFrame(test)

In [4]:
print(df_train.shape, df_test.shape)

(9658, 11) (1000, 10)


#### Clean Test Data

In [5]:
# Put the test columns into a fixed, explicit order. Note that reindex would
# create an all-NaN column for any listed name that is missing from df_test.
df_test = df_test.reindex(["doi", "title", "abstract", "authors", "venue", "year", "references",
                           "topics", "is_open_access", "fields_of_study"], axis=1)

In [6]:
df_test.isnull().sum()

doi                 0
title               0
abstract           19
authors             0
venue               0
year                0
references          0
topics              0
is_open_access      0
fields_of_study    13
dtype: int64

In [7]:
# Impute missing fields_of_study with the first modal value of the column:
# any entry that is not a list is replaced by that value.
mode_fos = df_test["fields_of_study"].mode().values[0]
df_test["fields_of_study"] = df_test["fields_of_study"].apply(lambda d: d if isinstance(d, list) else mode_fos)
# Flatten each list into one comma-separated string.
df_test["fields_of_study"] = [",".join(i) for i in df_test["fields_of_study"]]

In [8]:
df_test["authors"] = [",".join(i) for i in df_test["authors"]]
df_test["topics"] = [",".join(i) for i in df_test["topics"]]

In [9]:
# Treat empty or whitespace-only strings as missing, so that lists which
# joined to "" show up in the null counts.
df_test = df_test.replace(r'^\s*$', np.nan, regex=True)

In [10]:
df_test.isnull().sum()

doi                  0
title                0
abstract            19
authors              2
venue               56
year                 0
references           0
topics             231
is_open_access       0
fields_of_study      0
dtype: int64

In [11]:
df_test["abstract"] = df_test["abstract"].fillna("")
df_test["authors"] = df_test["authors"].fillna("")

In [12]:
mode_v = df_test["venue"].mode().values[0]
df_test["venue"] = df_test["venue"].fillna(mode_v)

In [13]:
mode_top = df_test["topics"].mode().values[0]
df_test["topics"] = df_test["topics"].fillna(mode_top)

In [14]:
df_test.shape

(1000, 10)

#### Clean Train Data

In [15]:
df_train.isnull().sum()

doi                  0
title                0
abstract           159
authors              0
venue                0
year                 3
references           0
topics               0
is_open_access       0
fields_of_study    136
citations            0
dtype: int64

In [16]:
# Impute missing fields_of_study with the first modal value of the training
# column: any entry that is not a list is replaced by that value.
mode_fos_train = df_train["fields_of_study"].mode().values[0]
df_train["fields_of_study"] = df_train["fields_of_study"].apply(lambda d: d if isinstance(d, list) else mode_fos_train)
# Flatten each list into one comma-separated string.
df_train["fields_of_study"] = [",".join(i) for i in df_train["fields_of_study"]]

In [17]:
df_train["authors"] = [",".join(i) for i in df_train["authors"]]
df_train["topics"] = [",".join(i) for i in df_train["topics"]]

In [18]:
# Treat empty or whitespace-only strings as missing, so that lists which
# joined to "" show up in the null counts.
df_train = df_train.replace(r'^\s*$', np.nan, regex=True)

In [19]:
df_train.isnull().sum()

doi                   0
title                 0
abstract            159
authors              13
venue               462
year                  3
references            0
topics             2134
is_open_access        0
fields_of_study       0
citations             0
dtype: int64

In [20]:
df_train["abstract"] = df_train["abstract"].fillna("")
df_train["authors"] = df_train["authors"].fillna("")

In [21]:
mode_v_train = df_train["venue"].mode().values[0]
df_train["venue"] = df_train["venue"].fillna(mode_v_train)

In [22]:
mode_top_train = df_train["topics"].mode().values[0]
df_train["topics"] = df_train["topics"].fillna(mode_top_train)

In [23]:
mode_year = df_train["year"].mode().values[0]
df_train["year"] = df_train["year"].fillna(mode_year)

In [24]:
df_train.shape

(9658, 11)

In [25]:
df_train["year"] = df_train["year"].astype(int)

#### Combine Train and Test Data

In [26]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the same rows (train first, then test) with a fresh
# 0..n-1 index, leaving `citations` as NaN for the test rows.
df_combined = pd.concat([df_train, df_test], ignore_index=True)
df_combined.shape

(10658, 11)

In [27]:
df_combined.isnull().sum()

doi                   0
title                 0
abstract              0
authors               0
venue                 0
year                  0
references            0
topics                0
is_open_access        0
fields_of_study       0
citations          1000
dtype: int64

In [28]:
# Only the test rows lack `citations` after the concat; they get a
# placeholder 0 here.
# NOTE(review): this placeholder later takes part in the groupby ranks over
# `citations`, so it is not a neutral fill — confirm that is intended.
df_combined["citations"] = df_combined["citations"].fillna(0)

In [29]:
df_combined.isnull().sum()

doi                0
title              0
abstract           0
authors            0
venue              0
year               0
references         0
topics             0
is_open_access     0
fields_of_study    0
citations          0
dtype: int64

#### Preprocess and Extract Features for the Model

In [30]:
#Venue
def clean_venue(text):
    """Normalise a venue string into a lower-case key without digits or punctuation.

    ASCII digits are removed first; the remainder is lower-cased and trimmed,
    and finally every character that is neither a word character nor
    whitespace is dropped. Because trimming happens before the punctuation
    pass, whitespace that preceded trailing punctuation is retained.
    """
    without_digits = re.sub(r'[0-9]', "", text)
    trimmed = without_digits.lower().strip()
    return re.sub(r'[^\w\s]', "", trimmed)

In [31]:
df_combined["venue"] = df_combined["venue"].apply(lambda x: clean_venue(x))

In [32]:
# Percentile rank of each paper's citations within its venue group
# (descending, so the most-cited papers get the smallest values).
# NOTE(review): this feature is computed from the prediction target itself,
# and the test rows carry the placeholder 0 from the earlier fillna, so they
# fall at the low-citation end of every venue. This looks like target
# leakage — confirm before trusting the validation score.
venue_rank = df_combined.groupby('venue')['citations'].rank(pct=True, ascending = False)

In [33]:
df_combined.insert(loc=5, column='venue_rank', value=venue_rank)

In [34]:
#Authors
# Work on an explicit copy so the helper columns below are never written
# onto a selection of df_combined (the warning for that is silenced at
# import time, which would otherwise hide the problem).
authors = df_combined[["authors"]].copy()
# Lower-case every name and collapse runs of whitespace to single spaces.
authors['authors_k'] = authors['authors'].apply(lambda raw: " ".join(part.lower() for part in raw.split()))
# Replace periods with spaces, then split on commas into a list of names.
authors['authors_p'] = [names.replace(".", " ").split(',') for names in authors['authors_k']]
df_combined["authors"] = authors['authors_p']

In [35]:
df_combined["authors"] = [",".join(i) for i in df_combined["authors"]]

In [36]:
def split_and_count(column):
    """Count how often each comma-separated token occurs across a column.

    Parameters
    ----------
    column : iterable of str
        Each entry is split on "," and every resulting piece (including empty
        strings) is counted as-is, without stripping.

    Returns
    -------
    collections.Counter
        Token -> number of occurrences. An empty ``column`` yields an empty
        Counter (previously this raised UnboundLocalError).
    """
    # Build the Counter once from a flat stream of tokens; the earlier version
    # rebuilt it from the whole list after every single append.
    return Counter(token for entry in column for token in entry.split(","))

In [37]:
author_count = df_combined['authors'].apply(lambda x: len(x.split(',')))

In [38]:
df_combined.insert(loc=4, column='authors_count', value=author_count)

In [39]:
df = pd.DataFrame(list(split_and_count(df_combined["authors"]).items()),columns = ['authors','counts'])

In [40]:
def author_score(var1,var2,main_df,index):
    """Sum the weights of all names that occur in one row's author string.

    ``var1`` and ``var2`` are walked in parallel (name, weight). A weight is
    added whenever the name passes an ``in`` test against ``main_df[index]``.
    When that element is a string this is a substring test, so short names
    also match inside longer ones and the empty string matches every row.

    Returns the accumulated weight, or 0 when nothing matches.
    """
    return sum(
        weight
        for name, weight in zip(var1, var2)
        if name in main_df[index]
    )

In [41]:
author_score(df["authors"], df["counts"],df_combined["authors"], 3)

31

In [42]:
author_scores = [author_score(df["authors"], df["counts"],df_combined["authors"], each) for each in df_combined.index]

In [43]:
df_combined.insert(loc=5, column='authors_score', value=author_scores)

In [44]:
# NOTE(review): magic offset. Blank author strings contribute a "" key to the
# author counts, and "" is a substring of every author string, so
# author_score presumably adds that key's count to every row. The null
# summaries show 13 + 2 blank author rows, which would explain 15 — TODO
# confirm and derive the value instead of hard-coding it.
# Also not idempotent: re-running this cell subtracts 15 again.
df_combined["authors_score"] = df_combined["authors_score"] - 15

In [45]:
# Percentile rank of citations among rows that share the same authors_score
# (ascending, so more-cited rows get larger values).
# NOTE(review): like venue_rank, this feature is derived from the prediction
# target, and the test rows carry the placeholder 0 from the earlier fillna,
# so they sit near the bottom of every group. This looks like target
# leakage — confirm.
authors_rank = df_combined.groupby('authors_score')['citations'].rank(pct=True, ascending = True)

In [46]:
df_combined.insert(loc=6, column='authors_rank', value=authors_rank)

In [126]:
df_combined

Unnamed: 0,doi,title,abstract,authors,authors_count,authors_score,authors_rank,venue,venue_rank,year,references,topics,is_open_access,fields_of_study,citations
0,10.3115/v1/P15-1039,Generating High Quality Proposition Banks for ...,Semantic role labeling (SRL) is crucial to nat...,"a akbik,laura chiticariu,marina danilevsky,yu...",6,27,0.870861,acl,0.171881,2015,39,Semantic role labeling,True,Computer Science,60.0
1,10.18653/v1/2020.eval4nlp-1.12,One of these words is not like the other: a re...,Word embeddings are an active topic in the NLP...,"jesper brink andersen,mikkel bak bertelsen,mik...",5,9,0.294314,evalnlp,0.722222,2020,44,Natural language processing,True,Computer Science,1.0
2,10.18653/v1/W17-3516,The Code2Text Challenge: Text Generation in So...,We propose a new shared task for tactical data...,"kyle richardson,sina zarrieß,jonas kuhn",3,29,0.336066,inlg,0.507874,2017,30,"Natural language generation,Library (computing...",True,Computer Science,5.0
3,10.18653/v1/S17-2160,The Meaning Factory at SemEval-2017 Task 9: Pr...,We evaluate a semantic parser based on a chara...,"rik van noord,johan bos",2,16,0.472637,semevalacl,0.500000,2017,11,"Parsing,Convolutional neural network,Text-base...",True,Computer Science,5.0
4,10.18653/v1/W15-2205,Semantic Parsing for Textual Entailment,In this paper we gauge the utility of general-...,"elisabeth lien,milen kouylekov",2,5,0.720149,iwpt,0.096774,2015,26,"Textual entailment,Parsing,SemEval,Semantic We...",True,Computer Science,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10653,10.18653/v1/P16-1217,Automatic Labeling of Topic Models Using Text ...,Labeling topics learned by topic models is a c...,"xiaojun wan,tianming wang",2,33,0.109524,acl,0.926205,2016,25,"Automatic summarization,Algorithm,Submodular s...",True,Computer Science,0.0
10654,10.18653/v1/D15-1192,Learning to Identify the Best Contexts for Kno...,We outline a learning framework that aims at i...,"evgenia wasserman pritsker,william w cohen,ei...",3,13,0.115108,emnlp,0.937788,2015,35,"Word-sense disambiguation,Lexical substitution...",True,Computer Science,0.0
10655,10.18653/v1/2020.iwpt-1.8,Tensors over Semirings for Latent-Variable Wei...,Semiring parsing is an elegant framework for d...,"esma balkir,d gildea,shay b cohen",3,50,0.067164,iwpt,0.870968,2020,23,"Parsing,Logic programming,Latent variable",True,Computer Science,0.0
10656,10.1162/tacl_a_00302,A Knowledge-Enhanced Pretraining Model for Com...,"Story generation, namely, generating a reasona...","jian guan,fei huang,zhihao zhao,xiaoyan zhu,mi...",5,40,0.104651,tacl,0.968750,2020,100,Commonsense knowledge (artificial intelligence...,True,Computer Science,0.0


In [61]:
#split the training and test
# Rows were concatenated train-first, so the first len(df_train) rows are the
# labelled set. Deriving the boundary avoids a hard-coded row count that
# would silently mis-split if the input files change.
n_train = len(df_train)
# Explicit copies keep later edits to these frames independent of df_combined.
training = df_combined.iloc[:n_train].copy()
testing = df_combined.iloc[n_train:].copy()

In [62]:
X = training[["authors_rank", "venue_rank"]]
y = training[["citations"]]

In [63]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
print(X_train.shape, X_val.shape)

(7726, 2) (1932, 2)


In [66]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [68]:
linreg = LinearRegression()

linreg.fit(X_train,y_train)

y_pred = linreg.predict(X_val)

In [69]:
r2_score(y_val, y_pred)

0.06276283958468953

In [70]:
def score(Y_true, Y_pred):
    """R^2-style score computed on log1p-transformed, non-negative values.

    Both inputs are clipped below at 0 and passed through ``log1p``. The
    result is one minus the ratio of the mean squared error to the mean
    squared deviation of the transformed truth from its own mean.
    """
    log_true = np.log1p(np.maximum(0, Y_true))
    log_pred = np.log1p(np.maximum(0, Y_pred))
    residual = np.mean((log_true - log_pred) ** 2)
    baseline = np.mean((log_true - np.mean(log_true)) ** 2)
    return 1 - residual / baseline

def evaluate(gold_path, pred_path):
    """Score a prediction file against a gold file.

    Parameters
    ----------
    gold_path, pred_path : str or path-like
        JSON files, each holding a list of objects with 'doi' and
        'citations' keys.

    Returns
    -------
    The value of ``score`` over the citation counts, aligned by the DOIs of
    the gold file (extra DOIs in the prediction file are ignored).

    Raises
    ------
    KeyError
        If a DOI from the gold file is missing from the prediction file.
    """
    # Context managers close both handles; they were previously left open.
    with open(gold_path) as gold_file:
        gold = { x['doi']: x['citations'] for x in json.load(gold_file) }
    with open(pred_path) as pred_file:
        pred = { x['doi']: x['citations'] for x in json.load(pred_file) }
    y_true = np.array([ gold[key] for key in gold ])
    y_pred = np.array([ pred[key] for key in gold ])
    return score(y_true, y_pred)

In [71]:
score(y_val, y_pred)

citations    0.428503
dtype: float64

In [125]:
y_pred

array([[18.61034504],
       [52.31418034],
       [-5.36573606],
       ...,
       [10.47657228],
       [22.77578636],
       [63.30071967]])

In [73]:
#Testing on new data
# Rebind instead of dropping in place: `testing` comes from slicing
# df_combined, and errors="ignore" keeps the cell safe to re-run once the
# placeholder target column is already gone.
testing = testing.drop(columns="citations", errors="ignore")

In [110]:
doi = pd.DataFrame(testing.iloc[: , 0])

In [111]:
testing_X = testing[["authors_rank", "venue_rank"]]

In [77]:
testing_X = scaler.transform(testing_X)

In [86]:
test_pred = linreg.predict(testing_X)

In [95]:
colname = ["citations"]

In [96]:
to_df = pd.DataFrame(data=test_pred, columns = colname)

In [115]:
# Take the unscaled feature columns straight from `testing`: when the cells
# run top to bottom, `testing_X` has been rebound to the ndarray returned by
# scaler.transform, and an ndarray has no reset_index.
predictions = pd.concat([doi.reset_index(drop=False), testing[["authors_rank", "venue_rank"]].reset_index(drop=False), to_df.reset_index(drop=False)], axis=1)

In [120]:
predictions.drop("index", axis=1, inplace=True)

In [121]:
predictions

Unnamed: 0,doi,authors_rank,venue_rank,citations
0,10.18653/v1/2021.findings-acl.255,0.102273,0.781491,-36.193852
1,10.18653/v1/2020.acl-main.200,0.132678,0.926205,-45.816634
2,10.18653/v1/W18-0211,0.160000,0.926205,-43.080026
3,10.18653/v1/N19-1413,0.120968,0.907300,-45.334609
4,10.18653/v1/2021.semeval-1.168,0.132678,0.906475,-44.089429
...,...,...,...,...
995,10.18653/v1/P16-1217,0.109524,0.926205,-48.135814
996,10.18653/v1/D15-1192,0.115108,0.937788,-48.590440
997,10.18653/v1/2020.iwpt-1.8,0.067164,0.870968,-47.543150
998,10.1162/tacl_a_00302,0.104651,0.968750,-52.348208


In [123]:
predicted = predictions.to_dict("records")

In [124]:
# Write inside a context manager so the file is flushed and closed.
# Previously the handle was overwritten with json.dump's None return value
# and never closed explicitly.
with open("predicted.json", "w") as a_file:
    json.dump(predicted, a_file)