In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json


In [2]:
import os
import re
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [3]:
import nltk
import subprocess

# Download and unzip wordnet
try:
    nltk.data.find('wordnet.zip')
    nltk.data.find('stopwords.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    nltk.download('stopwords', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data] Downloading package stopwords to /kaggle/working/...
[nltk_data]   Unzipping corpora/stopwords.zip.
Archive:  /kaggle/working/corpora/wordnet.zip
   creating: /kaggle/working/corpora/wordnet/
  inflating: /kaggle/working/corpora/wordnet/lexnames  
  inflating: /kaggle/working/corpora/wordnet/data.verb  
  inflating: /kaggle/working/corpora/wordnet/index.adv  
  inflating: /kaggle/working/corpora/wordnet/adv.exc  
  inflating: /kaggle/working/corpora/wordnet/index.verb  
  inflating: /kaggle/working/corpora/wordnet/cntlist.rev  
  inflating: /kaggle/working/corpora/wordnet/data.adj  
  inflating: /kaggle/working/corpora/wordnet/index.adj  
  inflating: /kaggle/working/corpora/wordnet/LICENSE  
  inflating: /kaggle/working/corpora/wordnet/citation.bib  
  inflating: /kaggle/working/corpora/wordnet/noun.exc  
  inflating: /kaggle/working/corpora/wordnet/verb.exc  
  inflating: /kaggle/working/corpora/wordnet/README

In [4]:
def preprocess_text(text, stop_words, lemmatizer):
    if pd.isna(text):
        return ''

    # Cleaning operations
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)

    words = word_tokenize(text.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]


    return ' '.join(words)



def preprocess_data(input_file, output_file, chunk_size=1000):
    stop_words = set(stopwords.words('english'))
    custom_stopwords = [
        "ability", "able", "absolute", "absolutely", "account", "accurate", "achieve", "address",
        "allowing", "also", "analyze", "analyzes", "answer", "application", "approach",
        "around", "article", "aspect", "audience", "author", "available", "based", "begin", "best", "better",
        "beyond", "bound", "brief", "called", "capable", "capture", "carefully", "case", "certain", "challenging",
        "compare", "complex", "component", "comprehensive", "concept", "conceptual",
        "conclusion", "condition", "conduct", "conjecture", "consider", "construct", "content", "context", "cost",
        "cross", "crucial", "current", "demonstrate", "derive", "derived", "describe",
        "described", "describes", "detailed", "determine", "developed", "different", "difficult", "directly",
        "discourse", "discuss", "distinguish", "driven", "due", "effect", "effective", "efficient", "efficiently",
        "eight", "element", "emphasis", "end", "enhanced", "evaluate", "even", "example", "experiment", "experimental",
        "explain", "extensive", "family", "feature", "figure", "find", "fine", "finite", "finitely", "first",
        "fit", "five", "found", "four", "framework", "function", "fundamental", "future", "general", "give",
        "given", "good", "grained", "graph", "group", "handed", "high", "higher", "however", "illustrate", "impact",
        "implement", "important", "include", "included", "integrate", "interest", "introduce", "introduced", "introduction",
        "investigate", "issue", "iteration", "known", "large", "last", "leading", "left", "let", "long", "low", "lower", "make",
        "many", "maximal", "may", "method", "methodology", "minimal", "model", "moreover", "multiple",
        "necessary", "need", "needed", "new", "news", "next", "nine", "non", "note", "novel", "number", "numerical",
        "objective", "observables", "observation", 'obtained', "often", "one", "open", "operator", "optimal", "order",
        "outline",
        "outlines", "output", "paper", "papr", "parameter", "part", "particular", "perform",
        "performance", "performed", "performing", "performance", "phase", "point", "possible",
        "potential", "pre", "precisely", "present", "previous", "principle", "problem", "process", "prof",
        "proof", "proper", "property", "propose", "proposed", "proposes", "prove", "provide", "provided",
        "publicly", "publish", "purpose", "quality", "question", "range", "real", "recent", "recently",
        "recommendation", "related", "reliable", "representation", "require", "research", "result", "rev", "review",
        "right", "rigorous", "role", "scale", "scenario", "second", "section", "selection", "series", "serious", "set", "setting",
        "seven", "show", "significant", "significantly", "simulation", "single", "six", "solution", "state",
        "strongly", "structure", "studied", "study", "sufficient", "suggestion", "sum", "synthesize", "system",
        "table", "take", "taken", "task", "technique", "ten", "term", "theorem", "theory", "third",
        "though", "three", "time", "topic", "two", "type", "upper", "use", "used", "using", "utilize", "valid",
        "value", "variable", "variety", "various", "via", "view", "way", "well", "whether", "wide", "widely", "within",
        "without", "work", "world", "written", "year", "zero", "zeroth"]
    stop_words.update(custom_stopwords)
    lemmatizer = WordNetLemmatizer()

    with open(input_file, 'r') as f:
        total_rows = sum(1 for _ in f)

    output_exists = os.path.isfile(output_file)

    with tqdm(total=total_rows) as pbar:
        for chunk in pd.read_json(input_file, lines=True, chunksize=chunk_size):
            # drop duplicates
            chunk = chunk[['title', 'abstract', 'categories', 'update_date']].drop_duplicates(
                subset=['title', 'abstract'])
            # preprocess text 
            chunk['title'] = chunk['title'].apply(preprocess_text, args=(stop_words, lemmatizer))
            chunk['abstract'] = chunk['abstract'].apply(preprocess_text, args=(stop_words, lemmatizer))
            # combine title and abstract into one column and drop original columns
            chunk['text'] = chunk['title'] + ' ' + chunk['abstract']
            chunk = chunk.drop(['title', 'abstract'], axis=1)
            chunk = chunk[chunk['text'] != '']
            # split categories into list of categories and save number of categories in a separate column and drop original column
            chunk['categories_list'] = chunk['categories'].str.split()
            chunk['num_categories'] = chunk['categories_list'].apply(len)
            chunk = chunk.drop(['categories'], axis=1)
            # convert update_date to datetime
            chunk['update_date'] = pd.to_datetime(chunk['update_date'])
            # save to csv
            if not output_exists:
                chunk.to_csv(output_file, mode='w', index=False)
                output_exists = True
            else:
                chunk.to_csv(output_file, mode='a', index=False, header=False)

            pbar.update(chunk_size)

In [5]:
raw_path = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'
processed_path = '/kaggle/working/data_preprocessed.csv'
preprocess_data(raw_path, processed_path)

2327000it [44:21, 874.32it/s]                              


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from umap import UMAP

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [3]:
input_file = '/kaggle/working/data_preprocessed.csv'
df = pd.read_csv(input_file)
df.head()

Unnamed: 0,update_date,text,categories_list,num_categories
0,2008-11-26,calculation prompt diphoton production section...,['hep-ph'],1
1,2008-12-13,sparsity certifying decomposition algorithm el...,"['math.CO', 'cs.CG']",2
2,2008-01-13,evolution earth moon dark matter field fluid e...,['physics.gen-ph'],1
3,2007-05-23,determinant stirling cycle number count unlabe...,['math.CO'],1
4,2013-10-15,dyadic lambda alpha lambda alpha compute lambd...,"['math.CA', 'math.FA']",2


In [4]:
def vectorize_text(df):
    # Instantiate TfidfVectorizer
    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
    X = vectorizer.fit_transform(df['text'])
    return X, vectorizer

tfidf_matrix, vectorizer = vectorize_text(df)

In [5]:
# Data Scaling
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(tfidf_matrix)

In [8]:
# Dimensionality reduction using LSA
def reduce_dimensions_lsa(X, n_components=150):
    lsa = TruncatedSVD(n_components=n_components)
    X_lsa = lsa.fit_transform(X)
    return X_lsa

X_lsa = reduce_dimensions_lsa(X_scaled)

In [None]:
# Taking a random sample of 20% of the data
sample_indices = np.random.choice(X_lsa.shape[0], int(0.2 * X_lsa.shape[0]), replace=False)
X_sample = X_lsa[sample_indices]

# calculate the sum of squared distances for several k
fig = plt.figure(figsize=(15,10))
wcss={}
for i in range(2,16):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42).fit(X_sample)
    wcss[i] = kmeans.inertia_

# Plot the results
plt.plot(list(wcss.keys()),list(wcss.values()))
plt.xlabel('Values for K')
plt.ylabel('Sum of Squared Distances')
plt.title('Elbow Method')
plt.xticks(list(wcss.keys()))
plt.grid(True)
plt.savefig('../images/elbow.png')
plt.show()

In [None]:
def determine_optimal_clusters(X):
    silhouette_scores = []
    davies_bouldin_scores = []


    # Taking a random sample of 1% of the data
    sample_indices = np.random.choice(X.shape[0], int(0.1 * X.shape[0]), replace=False)
    X_sample = X[sample_indices]

    # Define a range of clusters to test
    cluster_range = range(2, 21)

    for i in cluster_range:
        kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
        kmeans.fit(X_sample)

        # Calculate silhouette score
        silhouette_avg = silhouette_score(X_sample, kmeans.labels_)
        silhouette_scores.append(silhouette_avg)


        # Calculate Davies-Bouldin index
        davies_bouldin_avg = davies_bouldin_score(X_sample, kmeans.labels_)
        davies_bouldin_scores.append(davies_bouldin_avg)

        # Print the score for each cluster
        print("For n_clusters = {}, silhouette score is {})".format(i, silhouette_avg))
        print("For n_clusters = {}, Davies-Bouldin score is {})".format(i, davies_bouldin_avg))


    # Plot the scores 
    plt.figure(figsize=(15, 5))

    # Plot Silhouette Score
    plt.subplot(1, 2, 1)
    plt.plot(cluster_range, silhouette_scores, marker='o', linestyle='-', color='g')
    plt.xticks(cluster_range)
    plt.title('Silhouette Score')
    plt.xlabel('Number of clusters')
    plt.ylabel('Score')
    plt.grid(True)
    plt.savefig('../images/silhouette.png')

    # Plot Davies-Bouldin Score
    plt.subplot(1, 2, 2)
    plt.plot(cluster_range, davies_bouldin_scores, marker='o', linestyle='-', color='r')
    plt.xticks(cluster_range)
    plt.title('Davies-Bouldin Score')
    plt.xlabel('Number of clusters')
    plt.ylabel('Score')
    plt.grid(True)
    plt.savefig('../images/davies.png')
    
    plt.show()

# Call the function to determine optimal clusters
determine_optimal_clusters(X_lsa)