In [1]:
import pandas as pd


In [2]:
data = { 'Name': ['Alice', 'Bob', 'Charlie', 'David'], 'Age': [24, 25, 22, 23], 'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']}
df = pd.DataFrame(data)

In [3]:
print(df.head()) # First few rows

      Name  Age         City
0    Alice   24     New York
1      Bob   25  Los Angeles
2  Charlie   22      Chicago
3    David   23      Houston


In [4]:
print(df.tail()) # last few rows

      Name  Age         City
0    Alice   24     New York
1      Bob   25  Los Angeles
2  Charlie   22      Chicago
3    David   23      Houston


In [5]:
print(df.info())  # Summary of the DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes
None


In [6]:
print(df.describe()) # Statistical summary of numerical columns

             Age
count   4.000000
mean   23.500000
std     1.290994
min    22.000000
25%    22.750000
50%    23.500000
75%    24.250000
max    25.000000


In [7]:
print(df['Name']) # Selecting specific columns

0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object


In [8]:
# Selecting specific rows using iloc
print(df.iloc[0:2])

    Name  Age         City
0  Alice   24     New York
1    Bob   25  Los Angeles


In [9]:
# Basic operations
df['Age'] =  df['Age'] + 1
print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   26  Los Angeles
2  Charlie   23      Chicago
3    David   24      Houston


In [10]:
# Exporting DataFrame to CSV
df.to_csv('output.csv', index=False)

In [11]:
# Exporting DataFrame to JSON
df.to_json('Output.Json')

In [12]:
# Exporting DataFrame to HTML
df.to_html('output.html')

In [13]:
# delete initial DataFrame
del df

# Identifying Missing Values

In [14]:
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'], 'Age': [24, None, 22, 23], 'City':['New York', 'Los Angeles', None, 'Chicago']}
df =pd.DataFrame(data)

In [15]:
df.head()

Unnamed: 0,Name,Age,City
0,Alice,24.0,New York
1,Bob,,Los Angeles
2,Charlie,22.0,
3,David,23.0,Chicago


In [16]:
# Detect missing values
print(df.isnull())

    Name    Age   City
0  False  False  False
1  False   True  False
2  False  False   True
3  False  False  False


In [17]:
# Fill missing values with a specific value
df_filled = df.fillna({'Age': df['Age'].mean(), 'City': 'Unknown'})
df_filled.head()


Unnamed: 0,Name,Age,City
0,Alice,24.0,New York
1,Bob,23.0,Los Angeles
2,Charlie,22.0,Unknown
3,David,23.0,Chicago


In [18]:
df.head()

Unnamed: 0,Name,Age,City
0,Alice,24.0,New York
1,Bob,,Los Angeles
2,Charlie,22.0,
3,David,23.0,Chicago


In [19]:
# drop rows withany missing values
df_dropped = df.dropna()
df_dropped.head()

Unnamed: 0,Name,Age,City
0,Alice,24.0,New York
3,David,23.0,Chicago


In [20]:
# Print the DataFrame that filled missing values with other values
print("DataFrame after filling missing values:\n", df_filled)


DataFrame after filling missing values:
       Name   Age         City
0    Alice  24.0     New York
1      Bob  23.0  Los Angeles
2  Charlie  22.0      Unknown
3    David  23.0      Chicago


In [21]:
# Printthe DataFrame that was dropped
print("DataFrame after dropping ros with missing values:\n", df_dropped)

DataFrame after dropping ros with missing values:
     Name   Age      City
0  Alice  24.0  New York
3  David  23.0   Chicago


In [22]:
del df

# Identifying and Removing Duplicates

In [35]:
data = {'Name': ['Alice', 'Bob', 'Charlie', 'Bob'], 'Age': [24, 25, 22, 25], 'City': ['New York', 'Los Angeles', 'Chicago', 'los Angeles']}
df = pd.DataFrame(data)

In [36]:
df.head()

Unnamed: 0,Name,Age,City
0,Alice,24,New York
1,Bob,25,Los Angeles
2,Charlie,22,Chicago
3,Bob,25,los Angeles


In [37]:
# Identify duplicates
duplicates = df.duplicated()
print("Duplicate rows:\n", duplicates)

Duplicate rows:
 0    False
1    False
2    False
3    False
dtype: bool


In [38]:
# Remove duplicates
df_no_duplicates = df.drop_duplicates
print("DataFrame after removing duplicates:\n", df_no_duplicates)

DataFrame after removing duplicates:
 <bound method DataFrame.drop_duplicates of       Name  Age         City
0    Alice   24     New York
1      Bob   25  Los Angeles
2  Charlie   22      Chicago
3      Bob   25  los Angeles>


In [39]:
df.head()

Unnamed: 0,Name,Age,City
0,Alice,24,New York
1,Bob,25,Los Angeles
2,Charlie,22,Chicago
3,Bob,25,los Angeles


In [40]:
del df

Correcting inconsistencies

In [41]:
# Example: Correcting data types and standardizing formats
data = {   'ID': ['001', '002', '003', '004'],   'Price': ['100.5', '200', '300.25', '400.75'],   'Date': ['2023-01-01', '2023/01/02', '01-03-2023', 'January 4, 2023']
}
df = pd.DataFrame(data)

In [42]:
df.head()

Unnamed: 0,ID,Price,Date
0,1,100.5,2023-01-01
1,2,200.0,2023/01/02
2,3,300.25,01-03-2023
3,4,400.75,"January 4, 2023"


In [43]:
# Convert Price to float
df['Price'] = df['Price'].astype(float)


In [50]:
# Standardize Date format
df['Date'] = pd.to_datetime(df['Date'], format='%Y/%m/%d')

Example Solutions:
If your date strings are consistently formatted as "YYYY/MM/DD", you could modify your code like this:

df['Date'] = pd.to_datetime(df['Date'], format='%Y/%m/%d')  
If you are unsure about the formats:

df['Date'] = pd.to_datetime(df['Date'], errors='coerce')  # This will convert non-matching formats to NaT  
Choose the approach that best fits your data.

In [49]:
print("DataFrame with corrected data types and standardized formats:\n", df)

DataFrame with corrected data types and standardized formats:
     ID   Price       Date
0  001  100.50 2023-01-01
1  002  200.00        NaT
2  003  300.25        NaT
3  004  400.75        NaT
