In [120]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import linregress

In [121]:
# Read the prediction results into a DataFrame.
# NOTE(review): the file name embeds what looks like a generated part-file id;
# presumably it changes whenever the upstream job is re-run -- confirm and
# update the path if so.
file_path = (
    "output/Citedby_Prediction_Data.csv/"
    "part-00000-5ec6ac9c-e7d2-4c0a-9c7d-ca4ddfdff8c7-c000.csv"
)
df = pd.read_csv(filepath_or_buffer=file_path)

In [122]:
# Scatter of the `citedby_count` column (y) against the `prediction` column (x),
# with human-readable axis labels substituted for the raw column names.
fig = px.scatter(
    df,
    x="prediction",
    y="citedby_count",
    labels={
        "citedby_count": "Cited By Count",
        "prediction": "Predicted Cited By Count",
    },
    title="Scatter Plot of Cited By Count vs Predicted Cited By Count",
)

# 25%-opacity blue markers: overlapping points render progressively darker.
fig.update_traces(marker={"color": "rgba(31, 119, 180, 0.25)"})

fig.show()

In [123]:
# Least-squares fit of citedby_count (y) on prediction (x).
# Only rows where both columns are present are used: by default NaN inputs
# propagate through linregress and turn every returned statistic into NaN.
fit_data = df[['prediction', 'citedby_count']].dropna()
slope, intercept, r_value, p_value, std_err = linregress(
    fit_data['prediction'], fit_data['citedby_count']
)

# A straight line is fully described by its two end points, so the trace holds
# just those instead of one vertex per data row.
line_x = [fit_data['prediction'].min(), fit_data['prediction'].max()]
line_y = [intercept + slope * x for x in line_x]

# Overlay the fitted line on the existing scatter figure; the legend entry
# carries the coefficient of determination (r_value squared).
fig.add_trace(go.Scatter(
    x=line_x,
    y=line_y,
    mode='lines',
    name=f'Best Fit Line (R^2 = {r_value**2:.2f})'
))

In [124]:
# Restyle the figure and print the fit statistics underneath the plot area.
fig.update_layout(
    title='Scatter Plot of Cited By Count vs Predicted Cited By Count',
    # NOTE(review): blanking both axis titles removes the labels that were
    # passed to px.scatter; presumably this frees the space below the x-axis
    # for the annotation -- confirm it is intended.
    xaxis_title='',
    yaxis_title='',
    font=dict(
        family="Courier New, monospace",
        size=14,
        color="RebeccaPurple"
    ),
    annotations=[
        go.layout.Annotation(
            # Paper coordinates: x=0.5 is the horizontal centre and a negative
            # y places the text below the plotting area.
            x=0.5,
            y=-0.2,
            showarrow=False,
            # Three significant digits for the p-value: a fixed '.2f' would
            # print every p-value below 0.005 as '0.00'.
            text=f'R-squared: {r_value**2:.2f}, p-value: {p_value:.3g}',
            xref='paper',
            yref='paper'
        )
    ]
)

In [125]:
# Render the figure with whatever traces and layout have been applied so far.
fig.show()