In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, pointbiserialr
from scipy.stats import chi2_contingency, f_oneway

In [None]:
# Load the dataset.
# `parse_dates=True` only asks pandas to try parsing the *index*; no index
# column is set here, so it parsed nothing. Name the column explicitly so the
# frame comes back with `timestamp` already parsed where possible.
# NOTE: on_bad_lines='skip' silently drops malformed rows.
df = pd.read_csv(
    '/kaggle/input/onlytimestamp/fulltimestamponly.csv',
    parse_dates=['timestamp'],
    on_bad_lines='skip',
)

In [None]:
# Preview the first rows of the loaded frame (rich display as the cell output)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Column groups used by the analysis cells.

# Date-part columns.
num_cols = [
    'year',
    'month',
    'date',
]

# Categorical columns passed to the chi-square check.
cat_cols = [
    'Title',
    'Author',
    'publisher',
]

# Categorical columns passed to the ANOVA check
# (presumably the ones with fewer distinct values - TODO confirm).
less_cat_cols = [
    'day',
    'Group',
    'Format',
    'genre',
]

# Column every test is run against.
target_col = 'rank'

In [None]:
# Parse the timestamp column and put the rows in chronological order so the
# trend is read left-to-right in time
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(by='timestamp')

# Rank trend over time across the whole dataset
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=df, x='timestamp', y='rank', marker='o', ax=ax)
ax.invert_yaxis()  # Lower rank is better, so the best ranks are drawn at the top
ax.set_xlabel('Timestamp')
ax.set_ylabel('Rank')
ax.set_title('Book Rank Trend Over Time')
plt.xticks(rotation=45)
plt.show()

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt

def check_correlation_categorical(df, target_column, cat_columns):
    """Chi-square test of independence between each column and the target.

    For every name in ``cat_columns`` a contingency table against
    ``target_column`` is built with ``pd.crosstab`` and passed to
    ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the tested columns and the target column.
    target_column : str
        Name of the column every other column is tested against.
    cat_columns : iterable of str
        Names of the columns to test.

    Returns
    -------
    dict
        ``{column: {"status", "chi2_statistic", "p_value", "dof"}}`` where
        ``status`` is ``"Columns correlated"`` when ``p_value <= 0.05`` and
        ``"Columns not correlated"`` otherwise; the statistic is rounded to
        3 decimals and the p-value to 5.
    """

    def _summarise(column_name):
        # Observed frequency of every (column value, target value) pair
        observed = pd.crosstab(df[column_name], df[target_column])
        statistic, p, degrees, _expected = chi2_contingency(observed)

        if p <= 0.05:
            verdict = "Columns correlated"
        else:
            verdict = "Columns not correlated"

        return {
            "status": verdict,
            "chi2_statistic": round(statistic, 3),
            "p_value": round(p, 5),
            "dof": degrees,
        }

    return {column_name: _summarise(column_name) for column_name in cat_columns}

# Run the chi-square check for every column listed in `cat_cols`
chi_square_results = check_correlation_categorical(df, target_col, cat_cols)
print(chi_square_results)

# Same results again, one readable block per column
print("\nChi-Square Test Results for cat_cols (Categorical vs Numerical):\n")
separator = "-" * 40
for col, values in chi_square_results.items():
    report_lines = [
        f"Column: {col}",
        f"  Status         : {values['status']}",
        f"  Chi2-Statistic : {values['chi2_statistic']}",
        f"  P-Value        : {values['p_value']}",
        f"  Degrees of Freedom : {values['dof']}",
        separator,
    ]
    print("\n".join(report_lines))

In [None]:
from sklearn.preprocessing import LabelEncoder
from scipy.stats import pearsonr

# Encoding categorical variables.
# The cell previously read from `df1`, which is not defined anywhere in this
# notebook (NameError on a fresh kernel); `df` is the frame loaded above.
# Work on a copy so the string columns of `df` stay untouched.
encoder = LabelEncoder()
df1_encoded = df.copy()
for col in ["Author", "Title", "publisher"]:
    # fit_transform refits the encoder, so reusing one instance is safe
    df1_encoded[col] = encoder.fit_transform(df[col])

# Calculating correlation.
# Only numeric columns: DataFrame.corr() raises on string columns in pandas 2.x
correlation_matrix = df1_encoded.select_dtypes(include="number").corr()

# pearsonr cannot handle missing values, so rows without a rank are excluded
rank_known = df1_encoded["rank"].notna()

# Checking correlation with p-values and printing status
# NOTE(review): label codes are arbitrary integers, so Pearson's r on them is
# only a rough indicator - treat the result with care.
print("Correlation Analysis with Rank:\n")
for col in ["Author", "Title", "publisher"]:
    corr, pval = pearsonr(
        df1_encoded.loc[rank_known, "rank"],
        df1_encoded.loc[rank_known, col],
    )

    # Determine significance
    if pval < 0.05:
        status = "Correlation"
    else:
        status = "No Correlation"

    print(f"{col}: Correlation = {corr:.4f}, p-value = {pval:.4f} → {status}")



# Interpretation Guide:
# - If p-value < 0.05 → Statistically significant correlation
# - If p-value >= 0.05 → No significant correlation

In [None]:
def check_anova(df, target_column, cat_columns):
    """One-way ANOVA of the target across the levels of each given column.

    For every name in ``cat_columns`` the values of ``target_column`` are
    split by that column and passed to ``scipy.stats.f_oneway``.

    Missing target values are dropped before testing: a single NaN would
    otherwise turn the statistic and p-value into NaN, which was then
    reported as "Columns not correlated". Groups left without any value
    (including unused categories of a categorical column) are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the grouping columns and the target column.
    target_column : str
        Name of the numeric column whose group means are compared.
    cat_columns : iterable of str
        Names of the grouping columns to test.

    Returns
    -------
    dict
        ``{column: {"status", "f_statistic", "p_value"}}``. ``status`` is
        ``"Columns correlated"`` when ``p_value <= 0.05`` and
        ``"Columns not correlated"`` otherwise. Columns with fewer than two
        non-empty groups are left out of the result.
    """
    results = {}

    for cat_column in cat_columns:
        # observed=True ignores categories that never occur in the data
        grouped = df.groupby(cat_column, observed=True)[target_column]
        samples = [values.dropna().to_numpy() for _, values in grouped]
        samples = [sample for sample in samples if len(sample) > 0]

        if len(samples) > 1:  # ANOVA needs at least two groups
            f_statistic, p_value = f_oneway(*samples)

            results[cat_column] = {
                "status": "Columns correlated" if p_value <= 0.05 else "Columns not correlated",
                "f_statistic": round(f_statistic, 3),
                "p_value": round(p_value, 5)
            }

    return results

# Run the ANOVA check for every column listed in `less_cat_cols`
anova_results = check_anova(
    df=df,
    target_column=target_col,
    cat_columns=less_cat_cols,
)
print(anova_results)


In [None]:
# Select up to 20 random ASINs.
# A seeded generator keeps the selection - and therefore the figures -
# identical across "Restart & Run All" executions.
rng = np.random.default_rng(42)
random_asins = df['asin'].dropna().unique()
random_asins = rng.choice(random_asins, size=min(20, len(random_asins)), replace=False)
random_df = df[df['asin'].isin(random_asins)]

# Plot trend graph separately for each ASIN
for asin in random_asins:
    fig, ax = plt.subplots(figsize=(10, 5))
    asin_df = random_df[random_df['asin'] == asin]
    sns.lineplot(x='timestamp', y='rank', data=asin_df, marker='o', ax=ax)
    ax.invert_yaxis()  # best ranks at the top
    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Rank')
    ax.set_title(f'Rank Trend Over Time for ASIN: {asin}')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()
    plt.close(fig)  # do not keep up to 20 figures alive in the kernel


In [None]:
# Select up to 20 random ASINs.
# A seeded generator keeps the selection - and therefore the figures -
# identical across "Restart & Run All" executions.
rng = np.random.default_rng(42)
random_asins = df['asin'].dropna().unique()
random_asins = rng.choice(random_asins, size=min(20, len(random_asins)), replace=False)
random_df = df[df['asin'].isin(random_asins)]

# Plot trend graph separately for each ASIN based on Month
for asin in random_asins:
    fig, ax = plt.subplots(figsize=(10, 5))
    asin_df = random_df[random_df['asin'] == asin]
    sns.lineplot(x='month', y='rank', data=asin_df, marker='o', ax=ax)
    ax.invert_yaxis()  # best ranks at the top
    ax.set_xlabel('Month')
    ax.set_ylabel('Rank')
    ax.set_title(f'Rank Trend Over Months for ASIN: {asin}')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()
    plt.close(fig)  # do not keep up to 20 figures alive in the kernel

In [None]:
# Select up to 20 random ASINs.
# A seeded generator keeps the selection - and therefore the figures -
# identical across "Restart & Run All" executions.
rng = np.random.default_rng(42)
random_asins = df['asin'].dropna().unique()
random_asins = rng.choice(random_asins, size=min(20, len(random_asins)), replace=False)
random_df = df[df['asin'].isin(random_asins)]

# Plot trend graph separately for each ASIN based on Date
for asin in random_asins:
    fig, ax = plt.subplots(figsize=(10, 5))
    asin_df = random_df[random_df['asin'] == asin]
    sns.lineplot(x='date', y='rank', data=asin_df, marker='o', ax=ax)
    ax.invert_yaxis()  # best ranks at the top
    ax.set_xlabel('Date')
    ax.set_ylabel('Rank')
    ax.set_title(f'Rank Trend Over Dates for ASIN: {asin}')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()
    plt.close(fig)  # do not keep up to 20 figures alive in the kernel


In [None]:
# Define day order so weekdays sort Monday -> Sunday instead of alphabetically.
# NOTE: values of `day` that are not spelled exactly as below become NaN.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['day'] = pd.Categorical(df['day'], categories=day_order, ordered=True)

# Select up to 20 random ASINs.
# A seeded generator keeps the selection - and therefore the figures -
# identical across "Restart & Run All" executions.
rng = np.random.default_rng(42)
random_asins = df['asin'].dropna().unique()
random_asins = rng.choice(random_asins, size=min(20, len(random_asins)), replace=False)
random_df = df[df['asin'].isin(random_asins)]

# Plot trend graph separately for each ASIN based on Day of the Week
for asin in random_asins:
    fig, ax = plt.subplots(figsize=(10, 5))
    asin_df = random_df[random_df['asin'] == asin].sort_values('day')
    sns.lineplot(x='day', y='rank', data=asin_df, marker='o', ax=ax)
    ax.invert_yaxis()  # best ranks at the top
    ax.set_xlabel('Day of the Week')
    ax.set_ylabel('Rank')
    ax.set_title(f'Rank Trend Over Days of the Week for ASIN: {asin}')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()
    plt.close(fig)  # do not keep up to 20 figures alive in the kernel
