In [None]:
# Install seaborn into the running kernel's environment; a later cell imports it as `sns`.
%pip install seaborn

In [None]:
# Row-oriented construction: each inner list is one record, and the column
# labels are passed separately.
import pandas as pd

columns = ['ID', 'Name', 'Age']

data = [[1, 'Alice', 25], [2, 'Bob', 30], [3, 'Charlie', 22], [4, 'David', 28]]

df = pd.DataFrame(data=data, columns=columns)
print(df)

In [None]:
# Column-oriented construction: every dict key becomes a column label and
# its list supplies that column's values.

import pandas as pd

data = dict(
    ID=[1, 2, 3, 4],
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 22, 28],
)

df = pd.DataFrame(data=data)
print(df)

In [None]:
# Start from an empty frame that only declares its columns, then append one
# record per integer label through .loc.
import pandas as pd

df = pd.DataFrame(columns=['ID', 'Name', 'Age'])

rows = [
    [1, 'Alice', 25],
    [2, 'Bob', 30],
    [3, 'Charlie', 22],
    [4, 'David', 28],
]
for position, row in enumerate(rows):
    # Assigning to a label that does not exist yet enlarges the frame.
    df.loc[position] = row

print(df)

In [None]:
# Row labels can be supplied explicitly instead of the default 0..n-1 range.
import pandas as pd

data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 22, 28],
)

df = pd.DataFrame(data=data, index=list('ABCD'))
print(df)

In [None]:
# Creating a DataFrame with a DatetimeIndex:
import pandas as pd
import datetime

dates = [datetime.date(2023, 1, 1), datetime.date(2023, 1, 2), datetime.date(2023, 1, 3)]
data = {'Temperature': [32, 35, 30], 'Precipitation': [0.1, 0.0, 0.2]}

# Passing the plain datetime.date objects directly would produce a generic
# object-dtype Index. Converting them with pd.to_datetime yields a real
# DatetimeIndex, which is what enables date-based slicing and resampling.
df = pd.DataFrame(data, index=pd.to_datetime(dates))
print(df)

In [None]:
# Creating a DataFrame from External Data (e.g., CSV):
from pathlib import Path

import pandas as pd

# 'processed_data.csv' is written by the cell in this notebook that exports a
# DataFrame with df.to_csv('processed_data.csv', ...). On a fresh kernel that
# export has not run yet, so check for the file instead of failing with a
# FileNotFoundError.
csv_path = Path('processed_data.csv')

if csv_path.exists():
    df = pd.read_csv(csv_path)
    print(df)
else:
    print(f"'{csv_path}' not found - run the cell that exports it, then re-run this cell.")

In [None]:
import pandas as pd

# Small people table, given column by column
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Emily'],
    Age=[25, 30, 22, 28, 24],
    Gender=['Female', 'Male', 'Male', 'Male', 'Female'],
)

# Build the frame
df = pd.DataFrame(data=data)

# Peek at the top rows
print("Initial DataFrame:")
print(df.head(5))

# describe() reports count/mean/std/quantiles for the numeric columns
print("\nSummary Statistics:")
print(df.describe())

# Keep only the rows whose Age is strictly above 25
older_than_25 = df['Age'] > 25
filtered_df = df[older_than_25]
print("\nFiltered DataFrame:")
print(filtered_df)

# Derive a label column: ages up to and including 25 are 'Young', the rest 'Old'
df['Age_Group'] = df['Age'].le(25).map({True: 'Young', False: 'Old'})
print("\nDataFrame with New Column:")
print(df)

# Mean age within each gender
gender_grouped = df.groupby(by='Gender')['Age'].mean()
print("\nGrouped and Aggregated Data:")
print(gender_grouped)

# Oldest first
sorted_df = df.sort_values(by='Age', ascending=False)
print("\nSorted DataFrame:")
print(sorted_df)

# Write the frame (including the new column) to disk without the row index
df.to_csv('processed_data.csv', index=False)
print("\nDataFrame exported to CSV.")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Sample dataset; None marks the missing 'Age' and 'Science_Score' entries
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emily'],
    'Age': [25, 30, None, 28, 24],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
    'Math_Score': [85, 92, 78, 88, 76],
    'Science_Score': [90, None, 85, 92, 88],
    'Passed_Exam': ['Yes', 'Yes', 'No', 'Yes', 'No']
}

# Create a DataFrame
df = pd.DataFrame(data)
print(df, "\n")

# Step 1: Handling Missing Values
# Fill missing values in 'Age' with the mean age.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Age']: that form operates on an
# intermediate Series, triggers a chained-assignment warning in recent pandas,
# and leaves df untouched once Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].mean())
print(df, "\n")

# Drop the rows that have no 'Science_Score'
df.dropna(subset=['Science_Score'], inplace=True)
print(df, "\n")


# Step 2: Encoding Categorical Variables
# Convert 'Gender' and 'Passed_Exam' to numeric using Label Encoding.
# Each column gets its own encoder so that both fitted mappings stay available
# (e.g. for inverse_transform); refitting a single encoder would discard the
# 'Gender' classes. `label_encoder` still ends up fitted on 'Passed_Exam'.
gender_encoder = LabelEncoder()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])
label_encoder = LabelEncoder()
df['Passed_Exam'] = label_encoder.fit_transform(df['Passed_Exam'])
print(df, "\n")


# Display the preprocessed data
print("Preprocessed DataFrame:")
print(df, "\n")


In [None]:
# Working with date columns: parse strings into datetimes, then split out parts.
import pandas as pd

# Dates start out as plain ISO-formatted strings
data = dict(
    Date=['2023-01-15', '2023-02-20', '2023-03-25'],
    Sales=[100, 150, 200],
)

df = pd.DataFrame(data=data)
print(df, "\n")

# Parse the strings so the column holds real datetime values
df['Date'] = pd.to_datetime(df['Date'])

# The .dt accessor exposes the calendar components of each timestamp
date_parts = df['Date'].dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Day'] = date_parts.day

print(df)


In [None]:
# Dealing with Text Data
import pandas as pd

# Sample dataset with a text column
data = {
    'Text': ['Hello, World!', 'How are you?', '   Pandas is great!   ']
}

df = pd.DataFrame(data)
print(df, "\n")

# Remove leading and trailing whitespace
df['Text'] = df['Text'].str.strip()
print(df, "\n")

# Convert text to lowercase
df['Text'] = df['Text'].str.lower()
print(df, "\n")

# Remove punctuation (every character that is neither a word character nor
# whitespace). regex=True is required: Series.str.replace treats the pattern
# as a literal string by default in pandas >= 2.0, which would leave the text
# unchanged. The raw string keeps \w and \s from being read as (invalid)
# Python string escapes.
df['Text'] = df['Text'].str.replace(r'[^\w\s]', '', regex=True)
print(df, "\n")




In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the Iris example dataset through seaborn's dataset loader
iris = sns.load_dataset("iris")

# Peek at the top rows
print("First few rows of the Iris dataset:")
print(iris.head(5))

# Descriptive statistics for the numeric columns
print("\nSummary Statistics:")
print(iris.describe())

# Mean petal length within each species
species_grouped = iris.groupby(by='species')['petal_length'].mean()
print("\nAverage Petal Length by Species:")
print(species_grouped)

# --- Plots ---
# Each seaborn call returns the Axes it drew on; the title is set on that Axes.

# Sepal length against sepal width, one colour per species
ax = sns.scatterplot(data=iris, x='sepal_length', y='sepal_width', hue='species')
ax.set_title("Scatter Plot of Sepal Length vs. Sepal Width")
plt.show()

# Distribution of petal length for each species
ax = sns.boxplot(data=iris, x='species', y='petal_length')
ax.set_title("Box Plot of Petal Length by Species")
plt.show()

# Step histograms of sepal length, normalised separately per species
ax = sns.histplot(data=iris, x='sepal_length', hue='species', element="step", common_norm=False)
ax.set_title("Histogram of Sepal Length by Species")
plt.show()

# Save the dataset to disk without the row index
iris.to_csv('iris_data.csv', index=False)
print("\nDataFrame exported to CSV.")
