In [4]:
import polars as pl

# === STEP 1: Load the dataset using Polars ===
df = pl.read_csv("fb_posts_cleaned.csv")
print("Dataset loaded. Shape:", df.shape)

# === STEP 2: Descriptive Statistics for Numeric Columns ===
print("\n=== Overall Descriptive Statistics (numeric) ===")
print(df.describe())

# === STEP 3: Unique value counts per column ===
print("\n=== Number of Unique Values per Column ===")
for col in df.columns:
    unique_count = df[col].n_unique()
    print(f"{col}: {unique_count}")

# === STEP 4: Most Frequent Value for Each Non-Numeric Column ===
print("\n=== Most Frequent Values for Non-Numeric Columns ===")
non_numeric_cols = [col for col, dtype in df.schema.items() if dtype == pl.Utf8]
for col in non_numeric_cols:
    most_common = (
        df.select(pl.col(col).value_counts().sort(by="counts", descending=True).limit(1))
    )
    print(f"{col}:")
    print(most_common)

# === STEP 5: Grouped Stats by Facebook_Id (Only numeric columns) ===
numeric_cols = [col for col, dtype in df.schema.items() if dtype in [pl.Float64, pl.Float32, pl.Int64, pl.Int32]]
print("\n=== Grouped Stats by Facebook_Id (first 3 shown) ===")
grouped_page = (
    df.group_by("Facebook_Id")
      .agg([
          *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
          *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
          *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
          *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
          *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols]
      ])
)
print(grouped_page.head(3))

# === STEP 6: Grouped Stats by Facebook_Id and post_id (Only numeric columns) ===
print("\n=== Grouped Stats by Facebook_Id and post_id (first 3 shown) ===")
grouped_page_post = (
    df.group_by(["Facebook_Id", "post_id"])
      .agg([
          *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
          *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
          *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
          *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
          *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols]
      ])
)
print(grouped_page_post.head(3))


Dataset loaded. Shape: (19009, 53)

=== Overall Descriptive Statistics (numeric) ===
shape: (9, 54)
┌────────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬─────────────┬─────────────┬─────────────┐
│ statistic  ┆ Facebook_Id  ┆ post_id      ┆ Page         ┆ … ┆ scam_illumin ┆ freefair_il ┆ fraud_illum ┆ Post        │
│ ---        ┆ ---          ┆ ---          ┆ Category     ┆   ┆ ating        ┆ luminating  ┆ inating     ┆ Created     │
│ str        ┆ str          ┆ str          ┆ ---          ┆   ┆ ---          ┆ ---         ┆ ---         ┆ Timestamp   │
│            ┆              ┆              ┆ str          ┆   ┆ f64          ┆ f64         ┆ f64         ┆ ---         │
│            ┆              ┆              ┆              ┆   ┆              ┆             ┆             ┆ str         │
╞════════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪═════════════╪═════════════╪═════════════╡
│ count      ┆ 19009        ┆ 19009        ┆ 19009   

TypeError: Expr.sort() got an unexpected keyword argument 'by'

In [6]:
pip install polars

Note: you may need to restart the kernel to use updated packages.


In [8]:
import polars as pl

# === STEP 1: Load the dataset using Polars ===
df = pl.read_csv("fb_posts_cleaned.csv")
print("Dataset loaded. Shape:", df.shape)

# === STEP 2: Descriptive Statistics for Numeric Columns ===
print("\n=== Overall Descriptive Statistics (numeric) ===")
print(df.describe())

# === STEP 3: Unique value counts per column ===
print("\n=== Number of Unique Values per Column ===")
for col in df.columns:
    unique_count = df[col].n_unique()
    print(f"{col}: {unique_count}")

# === STEP 4: Most Frequent Value for Each Non-Numeric Column ===
print("\n=== Most Frequent Values for Non-Numeric Columns ===")
non_numeric_cols = [col for col, dtype in df.schema.items() if dtype == pl.Utf8]
for col in non_numeric_cols:
    most_common = (
        df.select(pl.col(col).value_counts().sort("counts", reverse=True).limit(1))
    )
    print(f"{col}:")
    print(most_common)

# === STEP 5: Grouped Stats by Facebook_Id (Only numeric columns) ===
numeric_cols = [col for col, dtype in df.schema.items() if dtype in [pl.Float64, pl.Float32, pl.Int64, pl.Int32]]
print("\n=== Grouped Stats by Facebook_Id (first 3 shown) ===")
grouped_page = (
    df.group_by("Facebook_Id")
      .agg([
          *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
          *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
          *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
          *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
          *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols]
      ])
)
print(grouped_page.head(3))

# === STEP 6: Grouped Stats by Facebook_Id and post_id (Only numeric columns) ===
print("\n=== Grouped Stats by Facebook_Id and post_id (first 3 shown) ===")
grouped_page_post = (
    df.group_by(["Facebook_Id", "post_id"])
      .agg([
          *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
          *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
          *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
          *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
          *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols]
      ])
)
print(grouped_page_post.head(3))


Dataset loaded. Shape: (19009, 53)

=== Overall Descriptive Statistics (numeric) ===
shape: (9, 54)
┌────────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬─────────────┬─────────────┬─────────────┐
│ statistic  ┆ Facebook_Id  ┆ post_id      ┆ Page         ┆ … ┆ scam_illumin ┆ freefair_il ┆ fraud_illum ┆ Post        │
│ ---        ┆ ---          ┆ ---          ┆ Category     ┆   ┆ ating        ┆ luminating  ┆ inating     ┆ Created     │
│ str        ┆ str          ┆ str          ┆ ---          ┆   ┆ ---          ┆ ---         ┆ ---         ┆ Timestamp   │
│            ┆              ┆              ┆ str          ┆   ┆ f64          ┆ f64         ┆ f64         ┆ ---         │
│            ┆              ┆              ┆              ┆   ┆              ┆             ┆             ┆ str         │
╞════════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪═════════════╪═════════════╪═════════════╡
│ count      ┆ 19009        ┆ 19009        ┆ 19009   

TypeError: Expr.sort() got an unexpected keyword argument 'reverse'

In [10]:
import polars as pl

# === STEP 1: Load the dataset using Polars ===
df = pl.read_csv("fb_posts_cleaned.csv")
print("Dataset loaded. Shape:", df.shape)

# === STEP 2: Descriptive Statistics for Numeric Columns ===
print("\n=== Overall Descriptive Statistics (numeric) ===")
print(df.describe())

# === STEP 3: Unique value counts per column ===
print("\n=== Number of Unique Values per Column ===")
for col in df.columns:
    unique_count = df[col].n_unique()
    print(f"{col}: {unique_count}")

# === STEP 4: Most Frequent Value for Each Non-Numeric Column
print("\n=== Most Frequent Values for Non-Numeric Columns ===")
non_numeric_cols = [col for col, dtype in df.schema.items() if dtype == pl.Utf8]
for col in non_numeric_cols:
    # Get value counts
    vc = df.select(pl.col(col).value_counts())
    
    # Sort manually using safe syntax (descending=True keyword works here)
    vc_sorted = vc.sort("counts", descending=True).limit(1)
    
    print(f"{col}:")
    print(vc_sorted)

# === STEP 5: Grouped Stats by Facebook_Id (Only numeric columns) ===
numeric_cols = [col for col, dtype in df.schema.items() if dtype in [pl.Float64, pl.Float32, pl.Int64, pl.Int32]]
print("\n=== Grouped Stats by Facebook_Id (first 3 shown) ===")
grouped_page = (
    df.group_by("Facebook_Id")
      .agg([
          *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
          *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
          *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
          *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
          *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols]
      ])
)
print(grouped_page.head(3))

# === STEP 6: Grouped Stats by Facebook_Id and post_id (Only numeric columns) ===
print("\n=== Grouped Stats by Facebook_Id and post_id (first 3 shown) ===")
grouped_page_post = (
    df.group_by(["Facebook_Id", "post_id"])
      .agg([
          *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
          *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
          *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
          *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
          *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols]
      ])
)
print(grouped_page_post.head(3))


✅ Dataset loaded. Shape: (19009, 53)

=== Overall Descriptive Statistics (numeric) ===
shape: (9, 54)
┌────────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬─────────────┬─────────────┬─────────────┐
│ statistic  ┆ Facebook_Id  ┆ post_id      ┆ Page         ┆ … ┆ scam_illumin ┆ freefair_il ┆ fraud_illum ┆ Post        │
│ ---        ┆ ---          ┆ ---          ┆ Category     ┆   ┆ ating        ┆ luminating  ┆ inating     ┆ Created     │
│ str        ┆ str          ┆ str          ┆ ---          ┆   ┆ ---          ┆ ---         ┆ ---         ┆ Timestamp   │
│            ┆              ┆              ┆ str          ┆   ┆ f64          ┆ f64         ┆ f64         ┆ ---         │
│            ┆              ┆              ┆              ┆   ┆              ┆             ┆             ┆ str         │
╞════════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪═════════════╪═════════════╪═════════════╡
│ count      ┆ 19009        ┆ 19009        ┆ 19009 

ColumnNotFoundError: "counts" not found

In [12]:
pip install polars

Note: you may need to restart the kernel to use updated packages.


In [1]:
import polars as pl

# Load the cleaned dataset
df = pl.read_csv("fb_posts_cleaned.csv")
print("Dataset loaded. Shape:", df.shape)

# STEP 1: Descriptive Statistics for numeric columns
print("\n=== Descriptive Statistics (Numeric) ===")
print(df.describe())

# STEP 2: Unique value counts per column
print("\n=== Unique Value Counts per Column ===")
for col in df.columns:
    print(f"{col}: {df[col].n_unique()}")

# STEP 3: Most Frequent Non-Numeric Values
print("\n=== Most Frequent Non-Numeric Values ===")
non_numeric_cols = [col for col, dtype in df.schema.items() if dtype == pl.Utf8]
for col in non_numeric_cols:
    vc_df = df.select(pl.col(col).value_counts())
    count_col = [c for c in vc_df.columns if c != col][0]
    vc_sorted = vc_df.sort(count_col, descending=True).limit(1)
    print(f"{col}:")
    print(vc_sorted)

# STEP 4: Grouped stats by Facebook_Id
numeric_cols = [col for col, dtype in df.schema.items() if dtype in [pl.Float64, pl.Float32, pl.Int64, pl.Int32]]
grouped_page = (
    df.group_by("Facebook_Id")
    .agg([
        *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
        *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
        *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
        *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
        *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols],
    ])
)
print("\n=== Grouped Stats by Facebook_Id ===")
print(grouped_page.head(3))

# STEP 5: Grouped stats by Facebook_Id and post_id
grouped_page_post = (
    df.group_by(["Facebook_Id", "post_id"])
    .agg([
        *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
        *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
        *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
        *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
        *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols],
    ])
)
print("\n=== Grouped Stats by Facebook_Id and post_id ===")
print(grouped_page_post.head(3))


✅ Dataset loaded. Shape: (19009, 53)

=== Descriptive Statistics (Numeric) ===
shape: (9, 54)
┌────────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬─────────────┬─────────────┬─────────────┐
│ statistic  ┆ Facebook_Id  ┆ post_id      ┆ Page         ┆ … ┆ scam_illumin ┆ freefair_il ┆ fraud_illum ┆ Post        │
│ ---        ┆ ---          ┆ ---          ┆ Category     ┆   ┆ ating        ┆ luminating  ┆ inating     ┆ Created     │
│ str        ┆ str          ┆ str          ┆ ---          ┆   ┆ ---          ┆ ---         ┆ ---         ┆ Timestamp   │
│            ┆              ┆              ┆ str          ┆   ┆ f64          ┆ f64         ┆ f64         ┆ ---         │
│            ┆              ┆              ┆              ┆   ┆              ┆             ┆             ┆ str         │
╞════════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪═════════════╪═════════════╪═════════════╡
│ count      ┆ 19009        ┆ 19009        ┆ 19009        ┆

IndexError: list index out of range

In [5]:
import polars as pl

#STEP 1: Load the cleaned dataset
df = pl.read_csv("fb_posts_cleaned.csv")
print("Dataset loaded. Shape:", df.shape)

#STEP 2: Descriptive Statistics for numeric columns
print("\n=== Descriptive Statistics (Numeric Columns) ===")
print(df.describe())

#STEP 3: Unique value counts per column
print("\n=== Unique Value Counts per Column ===")
for col in df.columns:
    print(f"{col}: {df[col].n_unique()}")

#STEP 4: Most Frequent Non-Numeric Values
print("\n=== Most Frequent Non-Numeric Values ===")
non_numeric_cols = [col for col, dtype in df.schema.items() if dtype == pl.Utf8]
for col in non_numeric_cols:
    try:
        vc_df = df.select(pl.col(col).value_counts())
        # Check if result has more than 1 column and at least 1 row
        if len(vc_df.columns) < 2 or vc_df.is_empty():
            print(f"{col}: No value counts available")
            continue
        count_col = [c for c in vc_df.columns if c != col][0]
        vc_sorted = vc_df.sort(count_col, descending=True).limit(1)
        print(f"{col}:")
        print(vc_sorted)
    except Exception as e:
        print(f"{col}: Error - {e}")

#STEP 5: Grouped stats by Facebook_Id
numeric_cols = [col for col, dtype in df.schema.items() if dtype in [pl.Float64, pl.Float32, pl.Int64, pl.Int32]]
grouped_page = (
    df.group_by("Facebook_Id")
    .agg([
        *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
        *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
        *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
        *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
        *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols],
    ])
)
print("\n=== Grouped Stats by Facebook_Id ===")
print(grouped_page.head(3))

#STEP 6: Grouped stats by Facebook_Id and post_id
grouped_page_post = (
    df.group_by(["Facebook_Id", "post_id"])
    .agg([
        *[pl.col(col).count().alias(f"{col}_count") for col in numeric_cols],
        *[pl.col(col).mean().alias(f"{col}_mean") for col in numeric_cols],
        *[pl.col(col).min().alias(f"{col}_min") for col in numeric_cols],
        *[pl.col(col).max().alias(f"{col}_max") for col in numeric_cols],
        *[pl.col(col).std().alias(f"{col}_std") for col in numeric_cols],
    ])
)
print("\n=== Grouped Stats by Facebook_Id and post_id ===")
print(grouped_page_post.head(3))


Dataset loaded. Shape: (19009, 53)

=== Descriptive Statistics (Numeric Columns) ===
shape: (9, 54)
┌────────────┬──────────────┬──────────────┬──────────────┬───┬──────────────┬─────────────┬─────────────┬─────────────┐
│ statistic  ┆ Facebook_Id  ┆ post_id      ┆ Page         ┆ … ┆ scam_illumin ┆ freefair_il ┆ fraud_illum ┆ Post        │
│ ---        ┆ ---          ┆ ---          ┆ Category     ┆   ┆ ating        ┆ luminating  ┆ inating     ┆ Created     │
│ str        ┆ str          ┆ str          ┆ ---          ┆   ┆ ---          ┆ ---         ┆ ---         ┆ Timestamp   │
│            ┆              ┆              ┆ str          ┆   ┆ f64          ┆ f64         ┆ f64         ┆ ---         │
│            ┆              ┆              ┆              ┆   ┆              ┆             ┆             ┆ str         │
╞════════════╪══════════════╪══════════════╪══════════════╪═══╪══════════════╪═════════════╪═════════════╪═════════════╡
│ count      ┆ 19009        ┆ 19009        ┆ 19009   