In [59]:
import pandas as pd

In [60]:
# Reading data from a CSV file
df = pd.read_csv('sample_data.csv')
print(df.head())  # Display the first 5 rows of the DataFrame



   Product_ID     Category  Price  Quantity
0           1  Electronics    899        10
1           2     Clothing    799         5
2           3  Electronics    299         2
3           4    Furniture    549         3
4           5     Clothing    499         4


In [61]:
print(df.tail())  # Display the last 5 rows of the DataFrame


   Product_ID   Category  Price  Quantity
3           4  Furniture    549         3
4           5   Clothing    499         4
5           6       Home    399         3
6           7       Home    199         2
7           8       Home    299         6


In [62]:


# Generate descriptive statistics
summary = df.describe()
print(summary)

       Product_ID       Price  Quantity
count     8.00000    8.000000   8.00000
mean      4.50000  492.750000   4.37500
std       2.44949  248.477507   2.66927
min       1.00000  199.000000   2.00000
25%       2.75000  299.000000   2.75000
50%       4.50000  449.000000   3.50000
75%       6.25000  611.500000   5.25000
max       8.00000  899.000000  10.00000


In [63]:
# Group by a column and calculate the mean of other columns
grouped_df = df.groupby('Category').mean()
print(grouped_df)


             Product_ID  Price  Quantity
Category                                
Clothing            3.5  649.0  4.500000
Electronics         2.0  599.0  6.000000
Furniture           4.0  549.0  3.000000
Home                7.0  299.0  3.666667


In [64]:


# Replace NaN values with 0
df_filled = df.fillna(0)
print(df_filled)

   Product_ID     Category  Price  Quantity
0           1  Electronics    899        10
1           2     Clothing    799         5
2           3  Electronics    299         2
3           4    Furniture    549         3
4           5     Clothing    499         4
5           6         Home    399         3
6           7         Home    199         2
7           8         Home    299         6


In [65]:
# Apply a lambda function to create a new column based on existing columns
df['Discounted_Price'] = df['Price'].apply(lambda x: x * 0.9)  # 10% discount
print(df.head())

   Product_ID     Category  Price  Quantity  Discounted_Price
0           1  Electronics    899        10             809.1
1           2     Clothing    799         5             719.1
2           3  Electronics    299         2             269.1
3           4    Furniture    549         3             494.1
4           5     Clothing    499         4             449.1


In [66]:
# Drop a column
df_dropped = df.drop(columns=['Discounted_Price'])  # Drop the previously created column
print(df_dropped.head())


   Product_ID     Category  Price  Quantity
0           1  Electronics    899        10
1           2     Clothing    799         5
2           3  Electronics    299         2
3           4    Furniture    549         3
4           5     Clothing    499         4


In [67]:

# Drop a row by index
df_dropped_row = df.drop(index=[0])  # Drop the first row
print(df_dropped_row.head())


   Product_ID     Category  Price  Quantity  Discounted_Price
1           2     Clothing    799         5             719.1
2           3  Electronics    299         2             269.1
3           4    Furniture    549         3             494.1
4           5     Clothing    499         4             449.1
5           6         Home    399         3             359.1


In [68]:

# Sort the DataFrame by 'Price' in ascending order
sorted_df = df.sort_values(by='Price')
print(sorted_df.head())


   Product_ID     Category  Price  Quantity  Discounted_Price
6           7         Home    199         2             179.1
2           3  Electronics    299         2             269.1
7           8         Home    299         6             269.1
5           6         Home    399         3             359.1
4           5     Clothing    499         4             449.1


In [69]:

# Sort by 'Price' in descending order
sorted_desc_df = df.sort_values(by='Price', ascending=False)
print(sorted_desc_df.head())


   Product_ID     Category  Price  Quantity  Discounted_Price
0           1  Electronics    899        10             809.1
1           2     Clothing    799         5             719.1
3           4    Furniture    549         3             494.1
4           5     Clothing    499         4             449.1
5           6         Home    399         3             359.1


In [70]:

# Create another DataFrame to merge
df2 = pd.DataFrame({
    'Product_ID': [1, 2, 3, 4],
    'Supplier': ['A', 'B', 'C', 'D']
})

In [71]:

# Merge DataFrames on 'Product_ID'
merged_df = pd.merge(df, df2, on='Product_ID')
print(merged_df.head())


   Product_ID     Category  Price  Quantity  Discounted_Price Supplier
0           1  Electronics    899        10             809.1        A
1           2     Clothing    799         5             719.1        B
2           3  Electronics    299         2             269.1        C
3           4    Furniture    549         3             494.1        D


In [72]:


# Create a pivot table to calculate the mean 'Price' for each 'Category'
pivot = df.pivot_table(values='Price', index='Category', aggfunc='mean')
print(pivot)



             Price
Category          
Clothing     649.0
Electronics  599.0
Furniture    549.0
Home         299.0


In [73]:


# Check for missing values in the DataFrame
missing_values = df.isnull()
print(missing_values.head())


   Product_ID  Category  Price  Quantity  Discounted_Price
0       False     False  False     False             False
1       False     False  False     False             False
2       False     False  False     False             False
3       False     False  False     False             False
4       False     False  False     False             False


In [74]:


# Check for missing values in any column
missing_in_any_column = df.isnull().any()
print(missing_in_any_column)



Product_ID          False
Category            False
Price               False
Quantity            False
Discounted_Price    False
dtype: bool


In [75]:


# Select the first row
first_row = df.iloc[0]
print(first_row)


Product_ID                    1
Category            Electronics
Price                       899
Quantity                     10
Discounted_Price          809.1
Name: 0, dtype: object


In [76]:


# Select the first three rows and specific columns
subset = df.iloc[:3, [0, 2]]  # First 3 rows, columns 0 and 2
print(subset)


   Product_ID  Price
0           1    899
1           2    799
2           3    299


In [77]:

# Count unique values in the 'Category' column
category_counts = df['Category'].value_counts()
print(category_counts)



Category
Home           3
Electronics    2
Clothing       2
Furniture      1
Name: count, dtype: int64


In [78]:


# Check for duplicate rows
duplicates = df.duplicated()
print(duplicates)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
dtype: bool


In [79]:

# Drop duplicate rows
df_no_duplicates = df.drop_duplicates()
print(df_no_duplicates)


   Product_ID     Category  Price  Quantity  Discounted_Price
0           1  Electronics    899        10             809.1
1           2     Clothing    799         5             719.1
2           3  Electronics    299         2             269.1
3           4    Furniture    549         3             494.1
4           5     Clothing    499         4             449.1
5           6         Home    399         3             359.1
6           7         Home    199         2             179.1
7           8         Home    299         6             269.1


In [80]:

# Convert the 'Price' column to integer type
df['Price'] = df['Price'].astype(int)
print(df.dtypes)


Product_ID            int64
Category             object
Price                 int64
Quantity              int64
Discounted_Price    float64
dtype: object


In [81]:

# Convert 'Product_ID' to string type
df['Product_ID'] = df['Product_ID'].astype(str)
print(df.dtypes)


Product_ID           object
Category             object
Price                 int64
Quantity              int64
Discounted_Price    float64
dtype: object


In [82]:

# Apply a function to every element in the DataFrame (e.g., multiplying all values by 2)
df_numeric = df[['Price']]  # Subset of numeric data
df_multiplied = df_numeric.applymap(lambda x: x * 2)
print(df_multiplied)


   Price
0   1798
1   1598
2    598
3   1098
4    998
5    798
6    398
7    598


  df_multiplied = df_numeric.applymap(lambda x: x * 2)


In [83]:

# Create two DataFrames
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame({'A': [7, 8, 9], 'B': [10, 11, 12]})


In [84]:

# Concatenate DataFrames along rows
df_concat = pd.concat([df1, df2], axis=0)
print(df_concat)


   A   B
0  1   4
1  2   5
2  3   6
0  7  10
1  8  11
2  9  12


In [85]:

# Concatenate DataFrames along columns
df_concat_columns = pd.concat([df1, df2], axis=1)
print(df_concat_columns)



   A  B  A   B
0  1  4  7  10
1  2  5  8  11
2  3  6  9  12


In [86]:

# Rename columns
df_renamed = df.rename(columns={'Price': 'Cost', 'Category': 'Product_Type'})
print(df_renamed.head())


  Product_ID Product_Type  Cost  Quantity  Discounted_Price
0          1  Electronics   899        10             809.1
1          2     Clothing   799         5             719.1
2          3  Electronics   299         2             269.1
3          4    Furniture   549         3             494.1
4          5     Clothing   499         4             449.1


In [87]:

# Rename index
df_renamed_index = df.rename(index={0: 'First', 1: 'Second'})
print(df_renamed_index.head())


       Product_ID     Category  Price  Quantity  Discounted_Price
First           1  Electronics    899        10             809.1
Second          2     Clothing    799         5             719.1
2               3  Electronics    299         2             269.1
3               4    Furniture    549         3             494.1
4               5     Clothing    499         4             449.1


In [88]:


# Create bins for the 'Price' column
price_bins = pd.cut(df['Price'], bins=[0, 50, 100, 200], labels=['Low', 'Medium', 'High'])
df['Price_Category'] = price_bins
print(df[['Price', 'Price_Category']])


   Price Price_Category
0    899            NaN
1    799            NaN
2    299            NaN
3    549            NaN
4    499            NaN
5    399            NaN
6    199           High
7    299            NaN


In [89]:
# Compute the correlation matrix for numerical columns
# Select only numeric columns
numeric_df = df.select_dtypes(include=['int64', 'float64'])

# Calculate correlation
correlation = numeric_df.corr()

# Display correlation matrix
print(correlation)

# correlation_matrix = df.corr()
# print(correlation_matrix)


                    Price  Quantity  Discounted_Price
Price             1.00000   0.72559           1.00000
Quantity          0.72559   1.00000           0.72559
Discounted_Price  1.00000   0.72559           1.00000


In [90]:

# Reset the index
df_reset = df.reset_index(drop=True)  # Drop the old index
print(df_reset.head())


  Product_ID     Category  Price  Quantity  Discounted_Price Price_Category
0          1  Electronics    899        10             809.1            NaN
1          2     Clothing    799         5             719.1            NaN
2          3  Electronics    299         2             269.1            NaN
3          4    Furniture    549         3             494.1            NaN
4          5     Clothing    499         4             449.1            NaN


In [91]:

# Drop rows with any NaN values
df_cleaned = df.dropna()
print(df_cleaned.head())

  Product_ID Category  Price  Quantity  Discounted_Price Price_Category
6          7     Home    199         2             179.1           High


In [92]:


# Drop columns with any NaN values
df_cleaned_columns = df.dropna(axis=1)
print(df_cleaned_columns.head())



  Product_ID     Category  Price  Quantity  Discounted_Price
0          1  Electronics    899        10             809.1
1          2     Clothing    799         5             719.1
2          3  Electronics    299         2             269.1
3          4    Furniture    549         3             494.1
4          5     Clothing    499         4             449.1


In [93]:

# Randomly select 3 rows from the DataFrame
df_sample = df.sample(n=3)
print(df_sample)

  Product_ID   Category  Price  Quantity  Discounted_Price Price_Category
1          2   Clothing    799         5             719.1            NaN
4          5   Clothing    499         4             449.1            NaN
3          4  Furniture    549         3             494.1            NaN


In [94]:

# Randomly select 20% of the DataFrame rows
df_sample_fraction = df.sample(frac=0.2)
print(df_sample_fraction)

  Product_ID  Category  Price  Quantity  Discounted_Price Price_Category
5          6      Home    399         3             359.1            NaN
1          2  Clothing    799         5             719.1            NaN


In [95]:

# Count unique values in each column
unique_counts = df.nunique()
print(unique_counts)

Product_ID          8
Category            4
Price               7
Quantity            6
Discounted_Price    7
Price_Category      1
dtype: int64


In [96]:

# Count unique values in a specific column
unique_categories = df['Category'].nunique()
print(unique_categories)


4


In [97]:


# Shift the 'Price' column down by 1 row
df['Shifted_Price'] = df['Price'].shift(1)
print(df[['Price', 'Shifted_Price']].head())


   Price  Shifted_Price
0    899            NaN
1    799          899.0
2    299          799.0
3    549          299.0
4    499          549.0


In [98]:


# Shift the 'Price' column up by 1 row
df['Shifted_Up_Price'] = df['Price'].shift(-1)
print(df[['Price', 'Shifted_Up_Price']].head())

   Price  Shifted_Up_Price
0    899             799.0
1    799             299.0
2    299             549.0
3    549             499.0
4    499             399.0


In [99]:


# Calculate the rolling mean (moving average) of the 'Price' column with a window of 3
df['Rolling_Mean_Price'] = df['Price'].rolling(window=3).mean()
print(df[['Price', 'Rolling_Mean_Price']].head(10))


   Price  Rolling_Mean_Price
0    899                 NaN
1    799                 NaN
2    299          665.666667
3    549          549.000000
4    499          449.000000
5    399          482.333333
6    199          365.666667
7    299          299.000000


In [100]:


# Filter rows where 'Price' is greater than 50 and 'Category' is 'Electronics'
filtered_df = df.query("Price > 50 and Category == 'Electronics'")
print(filtered_df)



  Product_ID     Category  Price  Quantity  Discounted_Price Price_Category  \
0          1  Electronics    899        10             809.1            NaN   
2          3  Electronics    299         2             269.1            NaN   

   Shifted_Price  Shifted_Up_Price  Rolling_Mean_Price  
0            NaN             799.0                 NaN  
2          799.0             549.0          665.666667  


In [101]:

# Example DataFrame with a column of lists
df_ex = pd.DataFrame({'Category': ['A', 'B'], 'Items': [['item1', 'item2'], ['item3', 'item4']]})


In [102]:
# Explode the 'Items' column so each list element gets its own row
df_exploded = df_ex.explode('Items')
print(df_exploded)

  Category  Items
0        A  item1
0        A  item2
1        B  item3
1        B  item4


In [103]:

# Calculate the cumulative sum of the 'Price' column
df['Cumulative_Sum_Price'] = df['Price'].cumsum()
print(df[['Price', 'Cumulative_Sum_Price']].head())


   Price  Cumulative_Sum_Price
0    899                   899
1    799                  1698
2    299                  1997
3    549                  2546
4    499                  3045


In [104]:

# Example DataFrame
df_pivot = pd.DataFrame({'Date': ['2023-01-01', '2023-01-02', '2023-01-01'],
                         'Product': ['A', 'A', 'B'], 'Sales': [10, 20, 15]})


In [105]:

# Pivot the DataFrame based on 'Date' and 'Product'
df_pivoted = df_pivot.pivot(index='Date', columns='Product', values='Sales')
print(df_pivoted)

Product        A     B
Date                  
2023-01-01  10.0  15.0
2023-01-02  20.0   NaN


In [106]:

# Rank the 'Price' column (1 is the lowest, NaN values are ignored)
df['Price_Rank'] = df['Price'].rank()
print(df[['Price', 'Price_Rank']].head())


   Price  Price_Rank
0    899         8.0
1    799         7.0
2    299         2.5
3    549         6.0
4    499         5.0


In [107]:

# Example DataFrame in wide format
df_melt = pd.DataFrame({'Year': [2020, 2021], 'Product_A': [100, 150], 'Product_B': [200, 250]})

In [108]:
# Unpivot the DataFrame from wide to long format
df_long = pd.melt(df_melt, id_vars='Year', var_name='Product', value_name='Sales')
print(df_long)


   Year    Product  Sales
0  2020  Product_A    100
1  2021  Product_A    150
2  2020  Product_B    200
3  2021  Product_B    250


In [109]:


# Example: Encode the 'Category' column into integer labels
df['Category_Encoded'], category_labels = pd.factorize(df['Category'])
print(df[['Category', 'Category_Encoded']].head())
print("Labels: ", category_labels)


      Category  Category_Encoded
0  Electronics                 0
1     Clothing                 1
2  Electronics                 0
3    Furniture                 2
4     Clothing                 1
Labels:  Index(['Electronics', 'Clothing', 'Furniture', 'Home'], dtype='object')


In [110]:

# Replace all prices below 50 with NaN
df['Price_Adjusted'] = df['Price'].where(df['Price'] >= 50)
print(df[['Price', 'Price_Adjusted']].head())


   Price  Price_Adjusted
0    899             899
1    799             799
2    299             299
3    549             549
4    499             499


In [111]:


# Get the 3 largest values from the 'Price' column
top_prices = df.nlargest(3, 'Price')
print(top_prices)


  Product_ID     Category  Price  Quantity  Discounted_Price Price_Category  \
0          1  Electronics    899        10             809.1            NaN   
1          2     Clothing    799         5             719.1            NaN   
3          4    Furniture    549         3             494.1            NaN   

   Shifted_Price  Shifted_Up_Price  Rolling_Mean_Price  Cumulative_Sum_Price  \
0            NaN             799.0                 NaN                   899   
1          899.0             299.0                 NaN                  1698   
3          299.0             499.0               549.0                  2546   

   Price_Rank  Category_Encoded  Price_Adjusted  
0         8.0                 0             899  
1         7.0                 1             799  
3         6.0                 2             549  


In [112]:



# Get the 3 largest values from the 'Price' column
low_prices = df.nsmallest(3, 'Price')
print(top_prices)


  Product_ID     Category  Price  Quantity  Discounted_Price Price_Category  \
0          1  Electronics    899        10             809.1            NaN   
1          2     Clothing    799         5             719.1            NaN   
3          4    Furniture    549         3             494.1            NaN   

   Shifted_Price  Shifted_Up_Price  Rolling_Mean_Price  Cumulative_Sum_Price  \
0            NaN             799.0                 NaN                   899   
1          899.0             299.0                 NaN                  1698   
3          299.0             499.0               549.0                  2546   

   Price_Rank  Category_Encoded  Price_Adjusted  
0         8.0                 0             899  
1         7.0                 1             799  
3         6.0                 2             549  
