In [2]:
import pandas as pd
import sweetviz as sv

# Load the dataset
file_path = "Games.csv"  # Update with correct path if needed
df = pd.read_csv(file_path)

# ✅ 1️⃣ Report the dataset's dimensions (rows x columns)
num_rows = len(df.index)
num_columns = len(df.columns)
print(f"Total Rows: {num_rows}", f"Total Columns: {num_columns}", sep="\n")

# ✅ 2️⃣ Show the dtype pandas inferred for every column
attribute_types = df.dtypes
print("\nColumn Data Types:", attribute_types, sep="\n")

# ✅ 3️⃣ Partition the column labels: numeric dtypes vs. everything else
# NOTE(review): in the output recorded with this cell, User_Score is reported
# as object, so it ends up in categorical_columns rather than numeric_columns.
# Presumably it contains non-numeric entries; confirm before treating it as a
# numeric score.
numeric_columns = df.select_dtypes(include="number").columns
categorical_columns = df.select_dtypes(exclude="number").columns

# ✅ 4️⃣ Summary statistics for numeric attributes
# describe() is transposed so each attribute is a row, and the "50%" quantile
# column is relabelled "median" for readability.
numeric_stats = (
    df[numeric_columns]
    .describe()
    .transpose()
    .rename(columns={"50%": "median"})
)
print("\nNumeric Attribute Statistics:", numeric_stats, sep="\n")

# ✅ 5️⃣ Number of distinct values per categorical attribute
unique_counts = df[categorical_columns].nunique()
print("\nUnique Values for Categorical Attributes:", unique_counts, sep="\n")

# ✅ 6️⃣ Three most frequent values of each categorical attribute
# value_counts() sorts by descending frequency, so the first three entries
# are the most common ones.
top_values = {}
for col in categorical_columns:
    top_values[col] = df[col].value_counts().head(3)

print("\nTop 3 Most Frequent Values for Categorical Attributes:")
for col, values in top_values.items():
    print(f"\nColumn: {col}", values, sep="\n")

# ✅ 7️⃣ Share of missing entries per column, expressed as a percentage
missing_percent = df.isna().sum().div(len(df)).mul(100)
print("\nMissing Value Percentages:", missing_percent, sep="\n")

# ✅ 8️⃣ Create a dataset profile table
# One row per column of df. Each Series below is indexed by df's column
# names, so the DataFrame constructor lines them up on that shared index.
profile_table = pd.DataFrame({
    "Data Type": df.dtypes,
    "Unique Values": df.nunique(),
    "Missing Values (%)": missing_percent
})

# Add statistics for numeric columns.
# The describe() labels are renamed to the profile's column names *before*
# they are attached. Assigning a Series through .loc aligns it on labels, so
# writing the lowercase 'min'/'max'/'mean'/'median'/'std' Series into
# 'Min'/'Max'/'Mean'/'Median'/'Std Dev' matches nothing and stores only NaN.
stat_labels = {
    'min': 'Min',
    'max': 'Max',
    'mean': 'Mean',
    'median': 'Median',
    'std': 'Std Dev',
}
numeric_profile = (
    numeric_stats
    .loc[numeric_columns, list(stat_labels)]
    .rename(columns=stat_labels)
)
# Left join on the column-name index: numeric attributes receive their
# statistics, all other rows keep NaN in the five statistic columns.
profile_table = profile_table.join(numeric_profile)

# Add top 3 most frequent values for categorical attributes.
# top_values (built in step 6) already holds value_counts().head(3) for each
# categorical column, so its index is reused instead of recounting.
top3_text = {
    col: ', '.join(map(str, top_values[col].index))
    for col in categorical_columns
}
# Assigning a Series aligns on the index; rows for numeric attributes are
# left as NaN. dtype=object keeps the column well-typed even when empty.
profile_table['Top 3 Values'] = pd.Series(top3_text, dtype=object)

# ✅ 9️⃣ Save the dataset profile table to a CSV file
# index=True writes the attribute names (the table's index) as the first column.
profile_table.to_csv("dataset_profile_table.csv", index=True)
print("\n✅ Dataset profile table saved as 'dataset_profile_table.csv'")

# ✅ 🔟 Build the automated SweetViz profiling report and write it to disk
# The output location is named first so the analysis and the save step read
# as one unit.
report_file_path = "sweetviz_report.html"
report = sv.analyze(df)

# ✅ 11️⃣ Write the report to an HTML file at report_file_path
report.show_html(report_file_path)

# ✅ 12️⃣ Tell the user where to find the report for manual viewing
print(
    "\n✅ SweetViz profiling report saved as 'sweetviz_report.html'",
    "➡ Open 'sweetviz_report.html' in your browser to view the interactive report.",
    sep="\n",
)


Total Rows: 16719
Total Columns: 16

Column Data Types:
Name                object
Platform            object
Year_of_Release    float64
Genre               object
Publisher           object
NA_Sales           float64
EU_Sales           float64
JP_Sales           float64
Other_Sales        float64
Global_Sales       float64
Critic_Score       float64
Critic_Count       float64
User_Score          object
User_Count         float64
Developer           object
Rating              object
dtype: object

Numeric Attribute Statistics:
                   count         mean         std      min      25%   median  \
Year_of_Release  16450.0  2006.487356    5.878995  1980.00  2003.00  2007.00   
NA_Sales         16719.0     0.263330    0.813514     0.00     0.00     0.08   
EU_Sales         16719.0     0.145025    0.503283     0.00     0.00     0.02   
JP_Sales         16719.0     0.077602    0.308818     0.00     0.00     0.00   
Other_Sales      16719.0     0.047332    0.186710     0.00     0.00

                                             |          | [  0%]   00:00 -> (? left)

Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.

✅ SweetViz profiling report saved as 'sweetviz_report.html'
➡ Open 'sweetviz_report.html' in your browser to view the interactive report.
