# **Data Wrangling**

### Creating a Data Frame

In [None]:
import pandas as pd
# Start from an empty DataFrame (no rows, no columns)
dataframe = pd.DataFrame()

### Describing the Data

In [None]:
# Location of the Titanic CSV (a local file path, despite the variable name)
url = "/content/Titanic.csv"
df = pd.read_csv(url)

# Preview the first two rows; df.tail(2) would show the last two instead
print(df.head(2))

# Report the shape tuple of the loaded frame
print(f"Dimensions: {df.shape}")

# Summary statistics; left as a bare final expression so that a notebook
# displays the result without an explicit print
df.describe()

### Navigating DataFrames

In [None]:
# First row (position 0)
print(df.iloc[0])
# Rows at positions 1, 2 and 3 (the end of the slice is exclusive)
print(df.iloc[1:4])
# The first four rows (positions 0 through 3)
print(df.iloc[:4])

# Use the 'Name' column as the row index while keeping it as a column
# Note: a label lookup yields a single row only when the names are unique
df = df.set_index('Name', drop=False)

# Look up a row by its label in the new index
print(df.loc["Wilkes, Mrs. James (Ellen Needs)"])

### Selecting Rows Based on Conditionals

In [None]:
# First two rows whose 'Sex' column equals 'female'
# Swap in your own column name and value
print(df.loc[df['Sex'].eq('female')].head(2))

# Combine several conditions with & (every condition must hold)
# Swap in your own column names and values
print(df.loc[df['Sex'].eq('female') & df['Age'].ge(65)])

### Replacing Values

In [None]:
# Swap every 'female' in the 'Sex' column for 'Woman'
# Substitute your own column, old value, and new value
print(df['Sex'].replace(to_replace='female', value='Woman').head(2))

# Several replacements at once, given as an {old: new} mapping
print(df['Sex'].replace({'female': 'Woman', 'male': 'Man'}).head(5))

# Replace a value wherever it occurs in the DataFrame
print(df.replace(to_replace=1, value="One").head(2))

### Renaming Columns

In [None]:
# Rename one column via an {old: new} mapping applied along the column axis
# Substitute your own names for 'PClass' and 'Passenger Class'
print(df.rename({'PClass': 'Passenger Class'}, axis='columns').head(2))

# Rename several columns in a single call
print(
    df.rename(
        {'PClass': 'Passenger Class', 'Sex': 'Gender'}, axis='columns'
    ).head(2)
)

### Finding the Min, Max, Sum, Average, and Count

In [None]:
# Statistics for a single numeric column; swap 'Age' for your own
print(f"Maximum: {df['Age'].max()}")
print(f"Minimum: {df['Age'].min()}")
print(f"Mean: {df['Age'].mean()}")
print(f"Sum: {df['Age'].sum()}")
print(f"Count: {df['Age'].count()}")

# Statistics computed across the DataFrame, restricted to numeric columns
print(f"Variance: {df.var(numeric_only=True)}")
print(f"Standard Deviation: {df.std(numeric_only=True)}")
print(f"Kurtosis: {df.kurt(numeric_only=True)}")
print(f"Skewness: {df.skew(numeric_only=True)}")

### Finding Unique Values

In [None]:
# Replace 'Sex' with your column
# unique() returns an array of the distinct values found in the column
print(df['Sex'].unique())

# value_counts() lists each distinct value alongside the number of times it appears
print(df['Sex'].value_counts())

### Handling Missing Values

In [None]:
# Show the first two rows whose 'Age' value is missing
# Swap 'Age' for your own column
print(df.loc[df['Age'].isna()].head(2))

### Deleting a Column




In [None]:
# columns= names the column label(s) to remove; the result is not assigned,
# so df itself keeps the column
# Swap 'Age' for your own column
print(df.drop(columns='Age').head(2))

### Deleting a Row

In [None]:
# Keep only the rows whose 'Sex' is not 'male'; df itself is not modified
# Swap 'Sex' and 'male' for your own column and value
print(df.loc[df['Sex'].ne('male')].head(2))

# Exclude a single row by matching a unique value (e.g., in the 'Name' column)
# Swap 'Name' and the value for your own
print(df.loc[df['Name'].ne('Allison, Miss Helen Loraine')].head(2))

# Exclude a row by position (here, the first row)
# Note: filtering with 'df.index != 0' only removes a row while the index
# holds the integer label 0; slicing by position works whatever the index
# contains.
print(df.iloc[1:].head(2))  # rows from the second one onward

### Grouping Rows by Values

In [None]:
# Replace 'Sex' with your grouping column
# Mean of the numeric columns within each group
print(df.groupby('Sex').mean(numeric_only=True))

# Replace 'Survived' and 'Name'
# Count of 'Name' entries within each 'Survived' group
print(df.groupby('Survived')['Name'].count())

# Group by multiple columns
# Mean 'Age' for each ('Sex', 'Survived') combination
print(df.groupby(['Sex', 'Survived'])['Age'].mean())

### Looping Over a Column

In [None]:
# Swap 'Name' for your own column
# Explicit for loop over the first two values, selected by position
for name in df['Name'].iloc[:2]:
    print(name.upper())

# The same transformation as a list comprehension (more "pythonic")
print([name.upper() for name in df['Name'].iloc[:2]])

### Applying a Function Over All Elements in a Column

In [None]:
# Define your function
def uppercase(x):
    """Return ``x`` upper-cased if it is a string; otherwise return it unchanged."""
    # Non-string values pass through untouched instead of raising
    return x.upper() if isinstance(x, str) else x

# Swap 'Name' for your own column
# Apply the function to every element, then show the first two results
print(df['Name'].apply(uppercase).iloc[:2])

### Applying a Function to Groups

In [None]:
# Replace 'Sex' with your grouping column
# The lambda is called once per group and returns that group's count()
# NOTE(review): recent pandas releases are believed to warn when apply() also
# operates on the grouping column — confirm against the installed version
print(df.groupby('Sex').apply(lambda x: x.count()))