# Extracting data from zip file

In [1]:
import zipfile
import json
import sqlite3
import pandas as pd
import seaborn as sns

# Create the staging table that `loader` fills: one row per JSON file read from the archive.
conn = sqlite3.connect("scopus.db")
try:
    cur = conn.cursor()
    cur.execute("""
CREATE TABLE IF NOT EXISTS papers_raw (
    file_id INTEGER,
    year INTEGER,
    raw_json TEXT
)
""")
    conn.commit()
finally:
    # Release the database handle even when the statement above fails.
    conn.close()

# Matches the inner path `loader` builds for the first 2018 file;
# not referenced by any other cell shown in this notebook.
keep = "ScopusData2018-2023/2018/201800000"
# Zip archive that `loader` opens to read the per-paper JSON files.
zip_path = "ScopusData2018-2023.zip"
# Year names as strings; `loader` is called with integer years and does not read this list.
years = ["2018", "2019", "2020", "2021", "2022", "2023"]

def loader(year: int, start_id: int, end_id: int, zip_file=None, db_path: str = "scopus.db"):
    """Copy one year's JSON records from the zip archive into ``papers_raw``.

    Every archive member ``ScopusData2018-2023/{year}/{file_id}`` with
    ``start_id <= file_id <= end_id`` is parsed as JSON and stored, re-serialised
    with ``ensure_ascii=False``, as one row ``(file_id, year, raw_json)``.
    Members that are missing from the archive or cannot be parsed are skipped.

    The call is idempotent: rows previously stored for the same ``year`` and id
    range are deleted in the same transaction before the new rows are inserted,
    so re-running a notebook cell does not duplicate data.  The transaction is
    committed only after the whole range has been processed; if anything fails
    (including an interrupted kernel) nothing is committed and the connection
    is still closed.

    The ``papers_raw`` table must already exist in the database.

    Args:
        year: Year folder inside the archive; also stored in the ``year`` column.
        start_id: First file id to load (inclusive).
        end_id: Last file id to load (inclusive).
        zip_file: Path of the zip archive.  When omitted, the notebook-level
            ``zip_path`` variable is used.
        db_path: Path of the SQLite database file.

    Returns:
        None.  A one-line summary of loaded / missing / unreadable files is printed.
    """
    archive_path = zip_path if zip_file is None else zip_file

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        # Remove rows left by an earlier run of the same range so that re-running
        # the cell replaces them instead of duplicating them.
        cur.execute(
            "DELETE FROM papers_raw WHERE year = ? AND file_id BETWEEN ? AND ?",
            (year, start_id, end_id),
        )
        loaded, missing, unreadable = _copy_members(cur, archive_path, year, start_id, end_id)
        conn.commit()
    finally:
        # Closing without a commit discards the uncommitted batch, so a failed or
        # interrupted run leaves the table as it was before the call.
        conn.close()

    print(
        f"{year}: loaded {loaded} files, {missing} not in archive, "
        f"{len(unreadable)} unreadable {unreadable[:10]}"
    )


def _copy_members(cur, archive_path, year, start_id, end_id):
    """Insert every readable archive member of the id range through ``cur``; return (loaded, missing, unreadable ids)."""
    loaded = 0
    missing = 0
    unreadable = []
    with zipfile.ZipFile(archive_path, "r") as z:
        for file_id in range(start_id, end_id + 1):
            inner_path = f"ScopusData2018-2023/{year}/{file_id}"
            try:
                with z.open(inner_path) as f:
                    try:
                        obj = json.load(f)
                    except Exception:
                        # Best effort, as before: one broken file must not abort the
                        # whole load, but it is recorded instead of vanishing silently.
                        unreadable.append(file_id)
                        continue
            except KeyError:
                # ZipFile.open raises KeyError when the member does not exist.
                missing += 1
                continue
            raw_text = json.dumps(obj, ensure_ascii=False)
            cur.execute(
                """
                INSERT INTO papers_raw (file_id, year, raw_json)
                VALUES (?, ?, ?)
                """,
                (file_id, year, raw_text)
            )
            loaded += 1
    return loaded, missing, unreadable




In [2]:

# Inclusive (first, last) file-id range to load for each year, in processing order.
YEAR_FILE_ID_RANGES = {
    2018: (201800000, 201802761),
    2019: (201900000, 201903081),
    2020: (202000000, 202003392),
    2021: (202100000, 202103814),
    2022: (202200000, 202204243),
    2023: (202300000, 202302889),
}

for load_year, (first_id, last_id) in YEAR_FILE_ID_RANGES.items():
    loader(load_year, first_id, last_id)

KeyboardInterrupt: 

In [24]:
# Sanity check: read the rows stored for 2018 back from the staging table.
QUERY_2018 = """
    SELECT file_id, year, raw_json
    FROM papers_raw
    WHERE year = 2018
    """

conn = sqlite3.connect("scopus.db")
try:
    df_2018 = pd.read_sql_query(QUERY_2018, conn)
finally:
    # Close the handle whether or not the query succeeded.
    conn.close()

print(df_2018.shape)
print(df_2018.head())

(2762, 3)
     file_id  year                                           raw_json
0  201800000  2018  {"abstracts-retrieval-response": {"item": {"ai...
1  201800001  2018  {"abstracts-retrieval-response": {"item": {"ai...
2  201800002  2018  {"abstracts-retrieval-response": {"item": {"ai...
3  201800003  2018  {"abstracts-retrieval-response": {"item": {"ai...
4  201800004  2018  {"abstracts-retrieval-response": {"item": {"ai...


In [1]:
import sqlite3, pandas as pd

# Flatten every raw JSON document in `papers_raw` into one row of `df`.
# Output columns: file_id, year, citation_title, abstracts, publishername, sourcetitle,
# publication_date (DD/MM/YYYY, NULL when day, month or year is missing),
# document_classification_codes (despite its name it holds the ce:doi value),
# refcount, citedbycount, allauthors_name (JSON array), categories (JSON array),
# creator, creator_degree, keywords (JSON array) and allauthors_count.
#
# The query is skipped when `df` already exists in the kernel. To pick up changes to the
# SQL or to scopus.db, run `del df` first and then run this cell again.
if "df" not in globals():
    SQL = """
    WITH base AS (
      SELECT
        file_id,
        year,

        -- basics
        json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."citation-title"') AS citation_title,
        json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.abstracts')        AS abstracts,
        json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publisher.publishername') AS publishername,
        json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.sourcetitle')             AS sourcetitle,

        /* publication_date: DD/MM/YYYY */
        CASE
          WHEN json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.day')   IS NOT NULL
           AND json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.month') IS NOT NULL
           AND json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.year')  IS NOT NULL
          THEN printf('%02d/%02d/%04d',
                      CAST(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.day')   AS INTEGER),
                      CAST(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.month') AS INTEGER),
                      CAST(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.year')  AS INTEGER))
          ELSE NULL
        END AS publication_date,

        /* ce:doi -> document_classification_codes */
        COALESCE(
          json_extract(raw_json,'$."abstracts-retrieval-response".item."item-info"."itemidlist"."ce:doi"'),
          (SELECT t.value
           FROM json_tree(raw_json, '$."abstracts-retrieval-response"') AS t
           WHERE t.key = 'ce:doi'
           LIMIT 1)
        ) AS document_classification_codes,

        -- counts
        json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.tail.bibliography."@refcount"') AS refcount,
        CAST(
          COALESCE(
            json_extract(raw_json,'$."abstracts-retrieval-response".coredata."citedby-count"'),
            json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."citedby-count"'),
            (SELECT t.value
             FROM json_tree(raw_json, '$."abstracts-retrieval-response"') AS t
             WHERE t.key = 'citedby-count'
             LIMIT 1)
          ) AS INTEGER
        ) AS citedbycount,

        /* allauthors_name = JSON array of "<given> <surname>" (no degrees) */
        (
          SELECT json_group_array(name_str)
          FROM (
            SELECT
              TRIM(
                TRIM(COALESCE(json_extract(a.value,'$."preferred-name"."ce:given-name"'),
                              json_extract(a.value,'$."ce:given-name"'), ''))
                || ' ' ||
                TRIM(COALESCE(json_extract(a.value,'$."preferred-name"."ce:surname"'),
                              json_extract(a.value,'$."ce:surname"'), ''))
              ) AS name_str
            FROM json_each(
              CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".authors.author'))
                WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".authors.author')
                WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".authors.author'))
                ELSE json_array()
              END
            ) AS a
            WHERE TRIM(
                    COALESCE(json_extract(a.value,'$."preferred-name"."ce:given-name"'),
                             json_extract(a.value,'$."ce:given-name"'), '') || ' ' ||
                    COALESCE(json_extract(a.value,'$."preferred-name"."ce:surname"'),
                             json_extract(a.value,'$."ce:surname"'), '')
                ) <> ''
          )
        ) AS allauthors_name,

        /* categories (subject → abbrev) */
        (
          SELECT json_group_array(json_object(subject, abbrev))
          FROM (
            SELECT DISTINCT
              json_extract(sa.value,'$."$"')        AS subject,
              json_extract(sa.value,'$."@abbrev"')  AS abbrev
            FROM (
              -- primary
              SELECT * FROM json_each(
                CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response"."subject-areas"."subject-area"'))
                  WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response"."subject-areas"."subject-area"')
                  WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response"."subject-areas"."subject-area"'))
                  ELSE json_array()
                END
              )
              UNION ALL
              -- fallback A
              SELECT * FROM json_each(
                CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."subject-areas"."subject-area"'))
                  WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."subject-areas"."subject-area"')
                  WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."subject-areas"."subject-area"'))
                  ELSE json_array()
                END
              )
              UNION ALL
              -- fallback B
              SELECT * FROM json_each(
                CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."subject-areas"."subject-area"'))
                  WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".coredata."subject-areas"."subject-area"')
                  WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."subject-areas"."subject-area"'))
                  ELSE json_array()
                END
              )
            ) AS sa
            WHERE subject IS NOT NULL AND abbrev IS NOT NULL
          )
        ) AS categories,

        /* creator name */
        (
          SELECT name_full
          FROM (
            SELECT TRIM(
                     COALESCE(json_extract(a.value,'$."preferred-name"."ce:given-name"'),
                              json_extract(a.value,'$."ce:given-name"'), '') || ' ' ||
                     COALESCE(json_extract(a.value,'$."preferred-name"."ce:surname"'),
                              json_extract(a.value,'$."ce:surname"'), '')
                   ) AS name_full
            FROM (
              SELECT * FROM json_each(
                CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author'))
                  WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author')
                  WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author'))
                  ELSE json_array()
                END
              )
              UNION ALL
              SELECT * FROM json_each(
                CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author'))
                  WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author')
                  WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author'))
                  ELSE json_array()
                END
              )
            ) AS a
            WHERE name_full <> ''
            LIMIT 1
          )
        ) AS creator,

        /* creator_degree */
        COALESCE(
          (
            SELECT deg FROM (
              SELECT NULLIF(TRIM(json_extract(a.value,'$."ce:degrees"')), '') AS deg
              FROM json_each(
                CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author'))
                  WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author')
                  WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author'))
                  ELSE json_array()
                END
              ) AS a
              UNION ALL
              SELECT NULLIF(TRIM(json_extract(a2.value,'$."ce:degrees"')), '') AS deg
              FROM json_each(
                CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author'))
                  WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author')
                  WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author'))
                  ELSE json_array()
                END
              ) AS a2
            )
            WHERE deg IS NOT NULL AND deg <> ''
            LIMIT 1
          ),
          (
            SELECT NULLIF(TRIM(json_extract(ag_author.value,'$."ce:degrees"')), '')
            FROM json_each(
              CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."author-group"'))
                WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."author-group"')
                WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."author-group"'))
                ELSE json_array()
              END
            ) AS ag
            JOIN json_each(
              CASE json_type(json_extract(ag.value,'$.author'))
                WHEN 'array'  THEN json_extract(ag.value,'$.author')
                WHEN 'object' THEN json_array(json_extract(ag.value,'$.author'))
                ELSE json_array()
              END
            ) AS ag_author
            WHERE NULLIF(TRIM(json_extract(ag_author.value,'$."ce:degrees"')), '') IS NOT NULL
            AND json_extract(ag_author.value,'$."@auid"') = (
              SELECT auid FROM (
                SELECT json_extract(a.value,'$."@auid"') AS auid
                FROM json_each(
                  CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author'))
                    WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author')
                    WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author'))
                    ELSE json_array()
                  END
                ) AS a
                UNION ALL
                SELECT json_extract(a2.value,'$."@auid"') AS auid
                FROM json_each(
                  CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author'))
                    WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author')
                    WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author'))
                    ELSE json_array()
                  END
                ) AS a2
              )
              WHERE auid IS NOT NULL
              LIMIT 1
            )
            LIMIT 1
          )
        ) AS creator_degree,

        /* keywords */
        (
          SELECT json_group_array(kw_src.kw)
          FROM (
            SELECT json_extract(k.value,'$."$"') AS kw
            FROM json_each(
              CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."citation-info"."author-keywords"."author-keyword"'))
                WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."citation-info"."author-keywords"."author-keyword"')
                WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."citation-info"."author-keywords"."author-keyword"'))
                ELSE json_array()
              END
            ) AS k
            UNION ALL
            SELECT json_extract(k2.value,'$."$"') AS kw
            FROM json_each(
              CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response"."authkeywords"."author-keyword"'))
                WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response"."authkeywords"."author-keyword"')
                WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response"."authkeywords"."author-keyword"'))
                ELSE json_array()
              END
            ) AS k2
          ) AS kw_src
          WHERE kw_src.kw IS NOT NULL
        ) AS keywords
      FROM papers_raw
    )
    SELECT
      base.*,
      COALESCE(json_array_length(base.allauthors_name), 0) AS allauthors_count
    FROM base
    ORDER BY year, file_id;
    """

    # Open the connection only when the query actually runs, and always close it.
    conn = sqlite3.connect("scopus.db")
    try:
        df = pd.read_sql_query(SQL, conn)
    finally:
        conn.close()
else:
    print("df already exists")


# Clean data

In [83]:
# Preview the first rows of the extracted frame.
df.head()

Unnamed: 0,file_id,year,citation_title,abstracts,publishername,sourcetitle,publication_date,document_classification_codes,refcount,citedbycount,allauthors_name,categories,creator,creator_degree,keywords,allauthors_count
0,201800000,2018,Public health and international epidemiology f...,,Springer International Publishing,"Radiology in Global Health: Strategies, Implem...",31/12/2018,10.1007/978-3-319-98485-8_15,76,1.0,"[""Krit Pongpirul"",""Matthew P. Lungren""]","[{""Medicine (all)"":""MEDI""}]",Krit Pongpirul,PhD..,[],2
1,201800001,2018,Flexible Printed Active Antenna for Digital Te...,"© 2018 The Institute of Electronics, Informati...",Institute of Electrical and Electronics Engine...,Progress in Electromagnetics Research Symposium,31/12/2018,10.23919/PIERS.2018.8597669,4,1.0,"[""Teerapong Pratumsiri"",""Panuwat Janpugdee""]","[{""Electrical and Electronic Engineering"":""ENG...",Teerapong Pratumsiri,,[],2
2,201800002,2018,Parametric study of hydrogen production via so...,© 2018 Elsevier LtdComputational fluid dynamic...,Elsevier Ltd,Chemical Engineering Science,31/12/2018,10.1016/j.ces.2018.08.042,42,21.0,"[""Kiattikhoon Phuakpunk"",""Benjapon Chalermsins...","[{""Chemistry (all)"":""CHEM""},{""Chemical Enginee...",Kiattikhoon Phuakpunk,,"[""Circulating fluidized bed"",""Computational fl...",4
3,201800003,2018,Superhydrophobic coating from fluoroalkylsilan...,© 2018 Elsevier B.V. A superhydrophobic/supero...,Elsevier B.V.,Applied Surface Science,31/12/2018,10.1016/j.apsusc.2018.08.059,45,37.0,"[""Jittraporn Saengkaew"",""Duy Le"",""Chanatip Sam...","[{""Chemistry (all)"":""CHEM""},{""Condensed Matter...",Jittraporn Saengkaew,,"[""Encapsulation"",""Fluoroalkylsilane"",""Natural ...",8
4,201800004,2018,Electrochemical impedance-based DNA sensor usi...,© 2018 Elsevier B.V. A label-free electrochemi...,Elsevier B.V.,Analytica Chimica Acta,31/12/2018,10.1016/j.aca.2018.07.045,55,68.0,"[""Prinjaporn Teengam"",""Weena Siangproh"",""Adiso...","[{""Analytical Chemistry"":""CHEM""},{""Biochemistr...",Prinjaporn Teengam,,"[""acpcPNA"",""Electrochemical impedance spectros...",6


In [84]:
# Start the cleaned frame without the file id, the abstract text and
# `document_classification_codes` (which the extraction query fills with the ce:doi value);
# none of the three is used by the cells below.
df_clean = df.drop(columns=["file_id","abstracts","document_classification_codes"])

In [85]:
# Show the remaining number of rows and columns, then the count of missing values per column.
print(df_clean.shape)
df_clean.isna().sum()

(20186, 13)


year                    0
citation_title          1
publishername          11
sourcetitle             0
publication_date     4105
refcount              411
citedbycount            4
allauthors_name         0
categories              0
creator                 0
creator_degree      18790
keywords                0
allauthors_count        0
dtype: int64

In [86]:
# Replace `publication_date` (DD/MM/YYYY strings) with the number of days between each
# paper's date and the most recent date in the data set.
# The column check keeps the cell re-runnable: after the first run the source column is gone.
if "publication_date" in df_clean.columns:
    publication_dates = pd.to_datetime(df_clean["publication_date"], format="%d/%m/%Y")
    # Series.max() skips missing dates (NaT). The builtin max() returns NaT whenever the
    # first row has no date, which would turn the whole new column into NaN.
    latest_date = publication_dates.max()
    df_clean = (
        df_clean
        .drop(columns=["publication_date"])
        .assign(days_from_latest_date=(latest_date - publication_dates).dt.days)
    )
else:
    print("Already done")
df_clean["days_from_latest_date"].value_counts()

days_from_latest_date
395.0     463
760.0     436
729.0     309
1125.0    284
364.0     269
         ... 
1906.0      1
1994.0      1
1314.0      1
1315.0      1
799.0       1
Name: count, Length: 1487, dtype: int64

In [87]:
# refcount is loaded with dtype object. Convert it to the nullable Int64 dtype;
# values that cannot be read as numbers become <NA>. info() is printed before and
# after so the dtype change is visible.
df_clean.info()
refcount_as_number = pd.to_numeric(df_clean["refcount"], errors="coerce")
df_clean["refcount"] = refcount_as_number.astype("Int64")
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20186 entries, 0 to 20185
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   20186 non-null  int64  
 1   citation_title         20185 non-null  object 
 2   publishername          20175 non-null  object 
 3   sourcetitle            20186 non-null  object 
 4   refcount               19775 non-null  object 
 5   citedbycount           20182 non-null  float64
 6   allauthors_name        20186 non-null  object 
 7   categories             20186 non-null  object 
 8   creator                20186 non-null  object 
 9   creator_degree         1396 non-null   object 
 10  keywords               20186 non-null  object 
 11  allauthors_count       20186 non-null  int64  
 12  days_from_latest_date  16081 non-null  float64
dtypes: float64(2), int64(2), object(9)
memory usage: 2.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex:

In [88]:
# Cleaning for 'citedbycount': check whether any non-missing value has a fractional part.
cited_counts = df_clean["citedbycount"]
float_exist = bool(((cited_counts.dropna() % 1) != 0).any())
print(float_exist)
# The check prints False for this data, so the column is converted to nullable Int64.
df_clean["citedbycount"] = cited_counts.astype("Int64")
df_clean.info()

False
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20186 entries, 0 to 20185
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   20186 non-null  int64  
 1   citation_title         20185 non-null  object 
 2   publishername          20175 non-null  object 
 3   sourcetitle            20186 non-null  object 
 4   refcount               19775 non-null  Int64  
 5   citedbycount           20182 non-null  Int64  
 6   allauthors_name        20186 non-null  object 
 7   categories             20186 non-null  object 
 8   creator                20186 non-null  object 
 9   creator_degree         1396 non-null   object 
 10  keywords               20186 non-null  object 
 11  allauthors_count       20186 non-null  int64  
 12  days_from_latest_date  16081 non-null  float64
dtypes: Int64(2), float64(1), int64(2), object(8)
memory usage: 2.0+ MB


In [89]:
# Preview the cleaned frame after the dtype conversions.
df_clean.head()

Unnamed: 0,year,citation_title,publishername,sourcetitle,refcount,citedbycount,allauthors_name,categories,creator,creator_degree,keywords,allauthors_count,days_from_latest_date
0,2018,Public health and international epidemiology f...,Springer International Publishing,"Radiology in Global Health: Strategies, Implem...",76,1,"[""Krit Pongpirul"",""Matthew P. Lungren""]","[{""Medicine (all)"":""MEDI""}]",Krit Pongpirul,PhD..,[],2,1826.0
1,2018,Flexible Printed Active Antenna for Digital Te...,Institute of Electrical and Electronics Engine...,Progress in Electromagnetics Research Symposium,4,1,"[""Teerapong Pratumsiri"",""Panuwat Janpugdee""]","[{""Electrical and Electronic Engineering"":""ENG...",Teerapong Pratumsiri,,[],2,1826.0
2,2018,Parametric study of hydrogen production via so...,Elsevier Ltd,Chemical Engineering Science,42,21,"[""Kiattikhoon Phuakpunk"",""Benjapon Chalermsins...","[{""Chemistry (all)"":""CHEM""},{""Chemical Enginee...",Kiattikhoon Phuakpunk,,"[""Circulating fluidized bed"",""Computational fl...",4,1826.0
3,2018,Superhydrophobic coating from fluoroalkylsilan...,Elsevier B.V.,Applied Surface Science,45,37,"[""Jittraporn Saengkaew"",""Duy Le"",""Chanatip Sam...","[{""Chemistry (all)"":""CHEM""},{""Condensed Matter...",Jittraporn Saengkaew,,"[""Encapsulation"",""Fluoroalkylsilane"",""Natural ...",8,1826.0
4,2018,Electrochemical impedance-based DNA sensor usi...,Elsevier B.V.,Analytica Chimica Acta,55,68,"[""Prinjaporn Teengam"",""Weena Siangproh"",""Adiso...","[{""Analytical Chemistry"":""CHEM""},{""Biochemistr...",Prinjaporn Teengam,,"[""acpcPNA"",""Electrochemical impedance spectros...",6,1826.0


# Cluster to Citation Impact Prediction

In [90]:
# Percentage of missing values per column. Most numeric features have few gaps, but
# days_from_latest_date is missing for about 20% of the rows. The gaps are left in place for
# XGBClassifier to handle, since a missing value can itself be informative.
(df_clean.isna().sum()/df_clean.shape[0])*100

year                      0.000000
citation_title            0.004954
publishername             0.054493
sourcetitle               0.000000
refcount                  2.036065
citedbycount              0.019816
allauthors_name           0.000000
categories                0.000000
creator                   0.000000
creator_degree           93.084316
keywords                  0.000000
allauthors_count          0.000000
days_from_latest_date    20.335876
dtype: float64

In [91]:
# Additional cleaning specific to this ML model: keep only rows that have a citation count,
# because the target is derived from it.
# .copy() makes the filtered frame independent, so the column assignments in later cells
# modify it directly instead of triggering pandas' SettingWithCopyWarning.
df_clean = df_clean[pd.notna(df_clean["citedbycount"])].copy()

In [92]:
# Get target column: 1 when a paper's citation count is strictly above the
# 80th percentile of citedbycount, otherwise 0.
y = (df_clean["citedbycount"]>df_clean["citedbycount"].quantile(0.8)).astype(int)

In [93]:
#Define parser for json string for keywords column
import ast
def parse_keywords(x):
    """Return the keyword list stored in one cell of the ``keywords`` column.

    Args:
        x: A list (returned unchanged), a missing value (``None``/NaN), or a
            string holding a JSON array such as ``'["a","b"]'``.

    Returns:
        The parsed list; ``[]`` for a missing value or a JSON ``null``.

    Raises:
        ValueError or SyntaxError: if a string is neither valid JSON nor a
            valid Python literal.
    """
    import json  # local import: this cell may run in a kernel where json was never imported

    # Lists must be recognised before pd.isna(): on a list pd.isna() returns an
    # array, and using that array as an `if` condition raises ValueError.
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []
    try:
        # The column holds JSON text, so use a JSON parser: it understands JSON
        # escapes such as "\/" and tokens such as null, which literal_eval does not.
        parsed = json.loads(x)
    except (TypeError, ValueError):
        # Not valid JSON (for example a Python-style "['a', 'b']" string):
        # keep the previous behaviour and parse it as a Python literal.
        return ast.literal_eval(x)
    return [] if parsed is None else parsed



In [94]:
# Build one free-text field per paper: title, source title and the space-joined keywords.
joined_keywords = [" ".join(parse_keywords(raw)) for raw in df_clean["keywords"]]
df_clean["comb_keywords"] = joined_keywords
title_text = df_clean["citation_title"].fillna("")
source_text = df_clean["sourcetitle"].fillna("")
df_clean["combined_text"] = title_text + " " + source_text + " " + df_clean["comb_keywords"]


In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from math import inf

# Vectorise the combined text: at most 5000 TF-IDF features over 1- to 3-grams,
# with English stop words removed.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english", ngram_range=(1, 3))
text_matrix = tfidf.fit_transform(df_clean["combined_text"])
print(text_matrix.shape)

# Fit KMeans for 10, 13, 16 and 19 clusters and keep the fitted model with the highest
# silhouette score (the earlier candidate wins on a tie).
best_score, best_i, best_model = -inf, None, None
for n_clusters in range(10, 21, 3):
    candidate = KMeans(n_clusters, random_state=42, n_init=10)
    candidate_labels = candidate.fit_predict(text_matrix)
    candidate_score = silhouette_score(text_matrix, candidate_labels)
    if candidate_score > best_score:
        best_score, best_i, best_model = candidate_score, n_clusters, candidate


(20182, 5000)


In [96]:
# Preview the frame with the new comb_keywords and combined_text columns.
df_clean.head()

Unnamed: 0,year,citation_title,publishername,sourcetitle,refcount,citedbycount,allauthors_name,categories,creator,creator_degree,keywords,allauthors_count,days_from_latest_date,comb_keywords,combined_text
0,2018,Public health and international epidemiology f...,Springer International Publishing,"Radiology in Global Health: Strategies, Implem...",76,1,"[""Krit Pongpirul"",""Matthew P. Lungren""]","[{""Medicine (all)"":""MEDI""}]",Krit Pongpirul,PhD..,[],2,1826.0,,Public health and international epidemiology f...
1,2018,Flexible Printed Active Antenna for Digital Te...,Institute of Electrical and Electronics Engine...,Progress in Electromagnetics Research Symposium,4,1,"[""Teerapong Pratumsiri"",""Panuwat Janpugdee""]","[{""Electrical and Electronic Engineering"":""ENG...",Teerapong Pratumsiri,,[],2,1826.0,,Flexible Printed Active Antenna for Digital Te...
2,2018,Parametric study of hydrogen production via so...,Elsevier Ltd,Chemical Engineering Science,42,21,"[""Kiattikhoon Phuakpunk"",""Benjapon Chalermsins...","[{""Chemistry (all)"":""CHEM""},{""Chemical Enginee...",Kiattikhoon Phuakpunk,,"[""Circulating fluidized bed"",""Computational fl...",4,1826.0,Circulating fluidized bed Computational fluid ...,Parametric study of hydrogen production via so...
3,2018,Superhydrophobic coating from fluoroalkylsilan...,Elsevier B.V.,Applied Surface Science,45,37,"[""Jittraporn Saengkaew"",""Duy Le"",""Chanatip Sam...","[{""Chemistry (all)"":""CHEM""},{""Condensed Matter...",Jittraporn Saengkaew,,"[""Encapsulation"",""Fluoroalkylsilane"",""Natural ...",8,1826.0,Encapsulation Fluoroalkylsilane Natural rubber...,Superhydrophobic coating from fluoroalkylsilan...
4,2018,Electrochemical impedance-based DNA sensor usi...,Elsevier B.V.,Analytica Chimica Acta,55,68,"[""Prinjaporn Teengam"",""Weena Siangproh"",""Adiso...","[{""Analytical Chemistry"":""CHEM""},{""Biochemistr...",Prinjaporn Teengam,,"[""acpcPNA"",""Electrochemical impedance spectros...",6,1826.0,acpcPNA Electrochemical impedance spectroscopy...,Electrochemical impedance-based DNA sensor usi...


In [97]:
# NOTE(review): X is only created in the following cell (In [98]), so on a fresh kernel with
# Restart & Run All this cell raises NameError. Move it below the modelling cell.
X.head()

Unnamed: 0,year,refcount,allauthors_count,days_from_latest_date,topic_cluster
0,2018,76,2,1826.0,16
1,2018,4,2,1826.0,17
2,2018,42,4,1826.0,18
3,2018,45,8,1826.0,9
4,2018,55,6,1826.0,13


In [98]:
# Predict the target with the cluster label from the text clustering as an additional feature.
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split,GridSearchCV
from xgboost import XGBClassifier

df_clean["topic_cluster"] = best_model.predict(text_matrix)

# Columns removed from the feature matrix: the text columns, plus citedbycount,
# which is dropped to prevent leakage because the target is derived from it.
NON_FEATURE_COLUMNS = [
    "citation_title",
    "sourcetitle",
    "keywords",
    "comb_keywords",
    "combined_text",
    "allauthors_name",
    "categories",
    "creator",
    "publishername",
    "creator_degree",
    "citedbycount",
]
X = df_clean.drop(columns=NON_FEATURE_COLUMNS)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Hyper-parameter grid searched with 5-fold cross-validation on the training split.
param_grid = {
    "max_depth": [3, 5, 7, 9],
    "n_estimators": [200, 400, 800],
    "learning_rate": [0.01, 0.05, 0.1],
}
cl_model = XGBClassifier(random_state=42)
grid = GridSearchCV(cl_model, param_grid=param_grid, n_jobs=-1, cv=5)
grid.fit(X_train, y_train)
best_xgb = grid.best_estimator_

# Evaluate the best estimator on the held-out 20% test split.
y_pred = best_xgb.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.88      0.95      0.91      3277
           1       0.68      0.45      0.54       760

    accuracy                           0.86      4037
   macro avg       0.78      0.70      0.73      4037
weighted avg       0.84      0.86      0.84      4037

