In [10]:
import csv
import io
import os
import random

import requests

# Endpoint that serves the papers dataset as CSV.
api_url = "https://ds-de-project.vercel.app/papers/csv"

# Credentials must not live in the notebook: read the key from the environment
# and fail early with a clear message when it has not been configured.
api_key = os.environ.get("PAPERS_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the PAPERS_API_KEY environment variable before running this cell."
    )

# Dedicated, seeded generator so the synthetic 'document' column is identical
# on every run without touching the global `random` state.
document_rng = random.Random(42)

# Fetch the CSV from the API; the timeout keeps the kernel from hanging
# forever if the server never answers.
response = requests.get(api_url, headers={"X-API-Key": api_key}, timeout=30)

if response.status_code == 200:
    # Parse the response body as CSV, one dict per row keyed by header name.
    csv_content = response.text
    csv_reader = csv.DictReader(io.StringIO(csv_content))

    # Columns kept in the trimmed-down output file.
    selected_columns = ["title", "2020 - 2023", "document"]
    modified_csv_file = "modified_papers.csv"

    with open(modified_csv_file, mode="w", newline="", encoding="utf-8") as file:
        csv_writer = csv.DictWriter(file, fieldnames=selected_columns)
        csv_writer.writeheader()

        for row in csv_reader:
            csv_writer.writerow(
                {
                    "title": row["title"],
                    # Fall back to 0 when the source CSV lacks this column.
                    "2020 - 2023": row.get("2020 - 2023", 0),
                    # Synthetic value: a seeded random integer in [100, 400].
                    "document": document_rng.randint(100, 400),
                }
            )

    print(f"Modified CSV saved as {modified_csv_file}")
else:
    # Best-effort reporting: surface the status and body instead of raising.
    print(f"Failed to fetch CSV. Status code: {response.status_code}, Error: {response.text}")

Modified CSV saved as modified_papers.csv


In [5]:
import pandas as pd
# Load the exported CSV and preview its first five rows.
df = pd.read_csv("modified_papers.csv")
df.head(n=5)

Unnamed: 0,title,2020 - 2023,document
0,GENEVA Risk and Insurance Review,141,342
1,Marine Resource Economics,376,138
2,American Journal of Health Economics,333,383
3,Asian Journal of Technology Innovation,456,339
4,Cambridge Journal of Economics,904,267


In [None]:
import pandas as pd
# Re-read the exported CSV and keep only the two numeric columns.
df = pd.read_csv("modified_papers.csv")
modified_df = df.loc[:, ["2020 - 2023", "document"]]
modified_df

Unnamed: 0,2020 - 2023,document
0,141,342
1,376,138
2,333,383
3,456,339
4,904,267
...,...,...
96,3,186
97,22,225
98,12,212
99,23,175


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the exported papers table from disk.
df = pd.read_csv("modified_papers.csv")
print("Original Data:")
print(df)

# Only these two numeric columns take part in the clustering.
features = df.loc[:, ["2020 - 2023", "document"]]

# Standardize the features; the fitted scaler is kept so cluster centers can
# later be mapped back to the original units.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

# Wrap the scaled array in a DataFrame purely for readable printing.
scaled_df = pd.DataFrame(
    data=scaled_features,
    columns=["2020 - 2023 (scaled)", "Document (scaled)"],
)
print("\nScaled Features:")
print(scaled_df)

# Elbow method: fit K-Means for k = 1..6 and record each model's inertia
# (the sum of squared distances from every point to its assigned centroid).
k_range = range(1, 7)
inertia = [
    KMeans(n_clusters=k, random_state=42).fit(scaled_features).inertia_
    for k in k_range
]

# Line chart of inertia against k; the "elbow" suggests a reasonable k.
fig_elbow = px.line(
    x=list(k_range),
    y=inertia,
    markers=True,
    title='Elbow Method for Determining Optimal k',
    labels={'x': 'Number of clusters (k)', 'y': 'Inertia'},
)
fig_elbow.update_layout(xaxis={'tickmode': 'linear'})
fig_elbow.show()

# Number of clusters chosen from the elbow plot.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(scaled_features)  # fit on the standardized features
df['Cluster'] = kmeans.labels_

print("\nCluster Assignments:")
print(df[['title', 'Cluster']])

# Map the centroids back from standardized space to the original units so
# they can be drawn on the same axes as the raw data.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
centroid_df = pd.DataFrame(centroids, columns=['2020 - 2023', 'document'])
centroid_df['Cluster'] = range(optimal_k)

# One color per cluster, keyed by the cluster label as a string. Plotly
# Express only applies color_discrete_map to non-numeric columns, so the
# plot uses a string-labelled copy while df['Cluster'] stays integer.
cluster_colors = ['blue', 'green', 'orange', 'purple', 'brown', 'gray']
cluster_labels = [str(cluster_id) for cluster_id in range(optimal_k)]
color_map = {
    label: cluster_colors[position % len(cluster_colors)]
    for position, label in enumerate(cluster_labels)
}
plot_df = df.assign(Cluster=df['Cluster'].astype(str))

# Scatter plot of the journals, colored by assigned cluster.
fig_clusters = px.scatter(
    plot_df,
    x='2020 - 2023',
    y='document',
    color='Cluster',
    color_discrete_map=color_map,
    category_orders={'Cluster': cluster_labels},  # stable legend order
    hover_data=['title'],
    title='K-Means Clustering of Journals',
    labels={'2020 - 2023': '2020 - 2023', 'document': 'Document'},
)

# Overlay the centroids as large red crosses.
fig_clusters.add_trace(go.Scatter(
    x=centroid_df['2020 - 2023'],
    y=centroid_df['document'],
    mode='markers',
    marker=dict(color='red', size=15, symbol='x'),
    name='Centroids'
))

# Label every point with its title, nudged by 10 data units on each axis so
# the text does not sit directly on top of the marker.
for title, x_value, y_value in zip(df['title'], df['2020 - 2023'], df['document']):
    fig_clusters.add_annotation(
        x=x_value + 10,
        y=y_value + 10,
        text=title,
        showarrow=False,
        font=dict(size=10)
    )

fig_clusters.update_layout(legend_title_text='Cluster')
fig_clusters.show()

Original Data:
                                                 title  2020 - 2023  document
0                     GENEVA Risk and Insurance Review          141       342
1                            Marine Resource Economics          376       138
2                 American Journal of Health Economics          333       383
3               Asian Journal of Technology Innovation          456       339
4                       Cambridge Journal of Economics          904       267
..                                                 ...          ...       ...
96   International Journal of Marketing Semiotics a...            3       186
97   Competitive Government: Public Private Partner...           22       225
98   Public Administration, Governance and Globaliz...           12       212
99   Revista del Ministerio de Trabajo y Economia S...           23       175
100             Zeitschrift fur Unternehmensgeschichte           12       389

[101 rows x 3 columns]

Scaled Features:
     20


Cluster Assignments:
                                                 title  Cluster
0                     GENEVA Risk and Insurance Review        0
1                            Marine Resource Economics        1
2                 American Journal of Health Economics        0
3               Asian Journal of Technology Innovation        0
4                       Cambridge Journal of Economics        0
..                                                 ...      ...
96   International Journal of Marketing Semiotics a...        1
97   Competitive Government: Public Private Partner...        1
98   Public Administration, Governance and Globaliz...        1
99   Revista del Ministerio de Trabajo y Economia S...        1
100             Zeitschrift fur Unternehmensgeschichte        0

[101 rows x 2 columns]
