**Pandas**

Pandas is a powerful and widely used open-source Python library for data manipulation and analysis. It provides efficient data structures and operations for working with tabular data, time series data, and more.

In [1]:
import pandas as pd

# Read the complete CSV file into a DataFrame
csv_path = '/content/data.csv'
df = pd.read_csv(csv_path)

# The column labels of the frame are the headings of the file
headings = list(df.columns)

# Show the headings
print("Headings:", headings)

Headings: ['User ID', 'country', 'Gender', 'Age', 'salary', 'Purchased']


In [4]:
import pandas as pd

# Parse just the header line of the CSV: nrows=0 asks for zero data rows,
# so the resulting frame is empty but still carries the column labels
header_only_options = {"nrows": 0}
df = pd.read_csv("/content/data.csv", **header_only_options)

# Collect the column labels into a plain Python list
headings = [label for label in df.columns]

# Show the headings
print("Headings:", headings)

Headings: ['User ID', 'country', 'Gender', 'Age', 'salary', 'Purchased']


In [6]:
import pandas as pd

# Build a small demonstration frame: three placeholder columns, three rows
old_labels = ['OldName1', 'OldName2', 'OldName3']
column_values = ([1, 2, 3], [4, 5, 6], [7, 8, 9])
df = pd.DataFrame(dict(zip(old_labels, column_values)))

# Show the frame before any renaming
print("Original DataFrame:", df, sep="\n")

# Replace every column label at once; the new list must supply exactly
# one label per existing column, in positional order
new_labels = ['NewName1', 'NewName2', 'NewName3']
df.columns = new_labels

# Show the frame again with its new labels
print("\nDataFrame after renaming all columns:", df, sep="\n")

Original DataFrame:
   OldName1  OldName2  OldName3
0         1         4         7
1         2         5         8
2         3         6         9

DataFrame after renaming all columns:
   NewName1  NewName2  NewName3
0         1         4         7
1         2         5         8
2         3         6         9


In [10]:
import pandas as pd
import numpy as np
from scipy import stats

# End-to-end cleaning walkthrough: load the raw CSV, inspect it, fill
# missing values, drop duplicate rows, normalise the 'country' text,
# filter z-score outliers and export the result to 'cleaned_data.csv'.

# Load data from a CSV file
df = pd.read_csv('/content/data.csv')

# Inspect the data
print("Original DataFrame:")
print(df.head())
print("\nDataFrame Info:")
# info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\nStatistical Summary:")
print(df.describe())

# Handling Missing Values
print("\nMissing Values in Each Column:")
print(df.isnull().sum())

# Drop rows with any missing values (optional)
# df_cleaned = df.dropna()

# Fill missing values per column type. Numeric columns still receive 0.
# Non-numeric columns receive a text placeholder instead of the integer 0:
# a 0 inside a text column is not a string, so the .str operations further
# down would turn it straight back into a missing value.
MISSING_TEXT = 'unknown'
numeric_columns = df.select_dtypes(include='number').columns
text_columns = df.select_dtypes(exclude='number').columns
fill_values = {
    **dict.fromkeys(numeric_columns, 0),
    **dict.fromkeys(text_columns, MISSING_TEXT),
}
# Rebind instead of inplace=True so re-running the cell is predictable
df = df.fillna(value=fill_values)

# Fill missing values with the mean of the column for a specific column
# df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

# Removing Duplicates
print("\nNumber of Duplicate Rows:", df.duplicated().sum())
# Explicit copy: df_cleaned owns its data, so the column assignments below
# cannot trigger pandas' SettingWithCopyWarning.
df_cleaned = df.drop_duplicates().copy()

# Changing Data Types
# Convert a column to a specific data type (e.g., integer)
# df_cleaned['column_name'] = df_cleaned['column_name'].astype(int)

# Convert a string column to datetime
# df_cleaned['date_column'] = pd.to_datetime(df_cleaned['date_column'])

# Renaming Columns
# Template mapping: labels missing from the frame are ignored by rename(),
# so this is a no-op unless an 'OldName' column exists.
df_cleaned = df_cleaned.rename(columns={'OldName': 'NewName'})

# Filtering Rows
# Filter rows where 'column_name' > threshold_value
# df_filtered = df_cleaned[df_cleaned['column_name'] > threshold_value]

# Standardizing Text Data
# Convert the text column to lowercase and remove leading/trailing whitespace
df_cleaned['country'] = df_cleaned['country'].str.lower().str.strip()
# Replace specific strings
# df_cleaned['text_column'] = df_cleaned['text_column'].str.replace('old_value', 'new_value')

# Binning Continuous Data
# Bin a continuous variable into categories
# bins = [0, 50, 100]
# labels = ['Low', 'High']
# df_cleaned['binned_column'] = pd.cut(df_cleaned['continuous_column'], bins=bins, labels=labels)

# Outlier Detection and Removal
# Keep only rows whose absolute z-score is below the threshold.
# NOTE(review): 'Purchased' looks like a 0/1 label in the sample output;
# continuous columns such as 'Age' or 'salary' are presumably the intended
# targets for outlier removal - confirm before relying on this filter.
OUTLIER_COLUMNS = ['Purchased']
Z_SCORE_THRESHOLD = 3
for outlier_column in OUTLIER_COLUMNS:
    column_values = df_cleaned[outlier_column]
    # A constant (or empty) column has zero variance: its z-scores are NaN,
    # and NaN < threshold is False, which would discard every row.
    if column_values.nunique() < 2:
        continue
    within_threshold = np.abs(stats.zscore(column_values)) < Z_SCORE_THRESHOLD
    df_cleaned = df_cleaned[within_threshold]

# Final Check
print("\nCleaned DataFrame Info:")
df_cleaned.info()
print("\nCleaned DataFrame:")
print(df_cleaned.head())

# Export the cleaned DataFrame to a new CSV file
df_cleaned.to_csv('cleaned_data.csv', index=False)


Original DataFrame:
    User ID country  Gender   Age   salary  Purchased
0  15624510   India    Male  19.0  19000.0          0
1  15810944     USA    Male  35.0      NaN          1
2  15668575  France  Female  26.0  43000.0          0
3  15603246     USA  Female   NaN  57000.0          0
4  15804002  France    Male  19.0  76000.0          0

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   User ID    20 non-null     int64  
 1   country    19 non-null     object 
 2   Gender     20 non-null     object 
 3   Age        17 non-null     float64
 4   salary     19 non-null     float64
 5   Purchased  20 non-null     int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 1.1+ KB
None

Statistical Summary:
            User ID        Age         salary  Purchased
count  2.000000e+01  17.000000      19.000000  20.000000
mean   1.56788