# Basic Analysis of Data Using Pandas

Scrutinizing and checking data is a crucial step in any data analysis process. It involves examining the data for accuracy, consistency, and completeness before performing any analysis or building models. In Python, the pandas library provides a wide range of tools and techniques for data exploration and validation.

## Loading the Data

In [None]:
import pandas as pd # test

# Read the CSV file into a DataFrame (replace the path with your own dataset)
df = pd.read_csv('your_dataset.csv')

# Preview the top of the dataset; head() returns the first 5 rows by default
first_rows = df.head()
print("First 5 rows of the dataset:\n", first_rows)

## Getting Basic Information

In [None]:
# Structural overview: info() writes its own report straight to stdout
print("\nDataFrame Information:\n")
df.info()

# Descriptive statistics for the numerical columns
numeric_summary = df.describe()
print("\nSummary statistics for numerical columns:\n", numeric_summary)

# Descriptive statistics for the object-dtype (categorical/text) columns
categorical_summary = df.describe(include=['O'])
print("\nSummary statistics for categorical columns:\n", categorical_summary)

# Dimensions of the DataFrame as a (rows, columns) tuple
dimensions = df.shape
print("\nShape of the DataFrame:", dimensions)

# Column labels as a plain Python list
column_names = df.columns.tolist()
print("\nColumn names:", column_names)

## Checking for Missing Values

In [None]:
# Count the missing entries in every column
missing_values = df.isnull().sum()
print("\nMissing values in each column:\n", missing_values)

# Optional visual check of where the gaps are (requires seaborn).
# The plotting cells further down reuse these two imports.
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the boolean "is missing" mask as a heatmap on an explicit axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title("Heatmap of Missing Values")
plt.show()

## Exploring Unique Values and Data Distribution

In [None]:
# Number of distinct values per column
unique_values = df.nunique()
print("\nUnique values in each column:\n", unique_values)

# Frequency table for every object-dtype (categorical/text) column
object_columns = df.select_dtypes(include=['object']).columns
for column_name in object_columns:
    frequencies = df[column_name].value_counts()
    print(f"\nValue counts for {column_name}:\n", frequencies)

# One histogram per numerical column, with a shared figure-level title
df.hist(bins=30, figsize=(12, 8))
plt.suptitle("Histograms of Numerical Columns")
plt.show()

## Detecting Duplicates

In [None]:
# Flag every row that repeats an earlier row, then pull those rows out
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]
print("\nNumber of duplicate rows:", len(duplicate_rows))
print("\nDuplicate rows:\n", duplicate_rows)

# Build a de-duplicated copy; the original df is left untouched
df_no_duplicates = df.drop_duplicates()
deduplicated_preview = df_no_duplicates.head()
print("\nDataFrame after removing duplicates:\n", deduplicated_preview)

## Checking for Outliers

In [None]:
# Box-and-whisker plot of the numerical columns to eyeball outliers
fig, ax = plt.subplots(figsize=(12, 8))
df.boxplot(ax=ax)
ax.set_title("Boxplot of Numerical Columns")
plt.show()

# Identifying outliers using the IQR method
def detect_outliers(col: pd.Series, factor: float = 1.5) -> pd.Series:
    """Return the values of *col* that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        col: Numerical Series to inspect.
        factor: Multiplier applied to the IQR when building the fences.
            The default of 1.5 is the conventional box-plot rule; a larger
            value (e.g. 3.0) flags only more extreme points.

    Returns:
        The subset of *col* (original index preserved) lying strictly below
        the lower fence or strictly above the upper fence. Missing values
        are never flagged because comparisons against NaN evaluate to False.
    """
    Q1 = col.quantile(0.25)
    Q3 = col.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    # Strict inequalities: values sitting exactly on a fence are kept as inliers
    return col[(col < lower_bound) | (col > upper_bound)]

# Run the IQR check on numerical columns only: quantiles are not defined for
# text columns, so applying detect_outliers to the whole DataFrame raises a
# TypeError as soon as a single object-dtype column is present.
numeric_df = df.select_dtypes(include='number')
outliers = numeric_df.apply(detect_outliers)
print("\nOutliers in each column:\n", outliers)

## Checking Data Types

In [None]:
# Show the dtype pandas assigned to every column
column_dtypes = df.dtypes
print("\nData types of each column:\n", column_dtypes)

# Convert data types if necessary.
# Example: parse a column as datetime; errors='coerce' replaces entries that
# cannot be parsed with NaT instead of raising.
parsed_dates = pd.to_datetime(df['DateColumn'], errors='coerce')
df['DateColumn'] = parsed_dates
print("\nData types after conversion:\n", df.dtypes)

## Exploring Relationships Between Variables

In [None]:
# Correlation matrix for numerical columns.
# Select the numeric columns explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns on its own and raises a ValueError when
# the frame still contains text columns.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()
print("\nCorrelation matrix:\n", correlation_matrix)

# Visualize the correlation matrix with a heatmap; the colour scale is pinned
# to [-1, 1] so that zero correlation sits at the midpoint of the palette
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Matrix Heatmap")
plt.show()

# Scatter plot for pairs of numerical variables
sns.pairplot(df)
# y=1.02 lifts the figure title slightly above the top row of the grid
plt.suptitle("Pairplot of Numerical Columns", y=1.02)
plt.show()

## Checking Data Consistency

In [None]:
# List the distinct values of each object-dtype column to spot typos or
# inconsistent category labels
text_columns = df.select_dtypes(include=['object']).columns
for column_name in text_columns:
    distinct_values = df[column_name].unique()
    print(f"\nUnique values in {column_name}:\n", distinct_values)

# Example: flag rows whose date lies after the current local timestamp
now = pd.Timestamp.today()
in_future = df['DateColumn'].gt(now)
invalid_dates = df.loc[in_future]
print("\nRows with invalid dates:\n", invalid_dates)

# Example: flag rows where a column that should be non-negative dips below zero
below_zero = df['NumericColumn'].lt(0)
negative_values = df.loc[below_zero]
print("\nRows with unexpected negative values:\n", negative_values)