# Task 1: Setup and DataFrame Creation

##### 1.1 Install Pandas and other necessary libraries
*Install pandas*

In [25]:
# ! pip install pandas

*install numpy*

In [26]:
# ! pip install numpy

*install matplotlib*

In [27]:
# ! pip install matplotlib

*install seaborn*

In [28]:
# ! pip install seaborn

##### 1.2 Import Pandas

In [29]:
# import pandas as pd

##### 1.3 Creating DataFrame from dictionary

In [30]:
my_dict = {
    "Name": ["Anish", "Manish", "Sanish", "Vanish"],
    "Age": [30, 29, 25, 18],
    "Occupation": ["Manager","Teacher", "Banker", "Student"]
}
df = pd.DataFrame(my_dict)
df

NameError: name 'pd' is not defined

##### 1.4 Creating DataFrame from list of dictionaries

In [None]:
my_list_dict = [
    {"Name":"Anish", "Age": 30, "Occupation":"Manager"},
    {"Name":"Manish", "Age": 29, "Occupation":"Teacher"},
    {"Name":"Sanish", "Age": 25, "Occupation":"Banker"},
    {"Name":"Vanish", "Age": 18, "Occupation":"Student"}
]
df = pd.DataFrame(my_dict)
df

##### 1.5 Loading dataset from csv

In [None]:
df = pd.read_csv('messed_dataset.csv')
df.head()

# Task 2: Viewing and Inspecting Data

##### 2.1 Display first few rows of the DataFrame

In [None]:
# first 5 rows
df.head(5)

##### 2.2 Display last few rows of the DataFrame

In [None]:
# last 5 rows
df.tail(5)

##### 2.3 Getting Summary of the DataFrame

In [None]:
# Display datatypes 
df.info()

In [None]:
# Basic statistics
df.describe()

##### 2.4 Display shape and column name of the dataFrame

In [None]:
# getting shape
df.shape

In [None]:
# Column names
df.columns

# Task 3: Selection and Indexing


##### 3.1 Selecting columns

In [None]:
# selecting single column - eg: Name column
df["Name"]

In [None]:
# selecting multiple column - eg: Name and Age
df[["Name", "Age"]]

##### 3.2 Select rows by index and by label

In [None]:
# by index (using iloc)
df.iloc[0:5] #selects first five rows - 0,1,2,3,4

In [None]:
# by label (using loc)
df.loc[0:5] # selects first 6 rows 0, 1, 2, 3, 4, 5

##### 3.3 Select specific rows and columns using loc and iloc

In [None]:
# specific row and column using loc
df.loc[0:5, ["Name", "Age", "Occupation"]]

In [None]:
# specific row and column using iloc
df.iloc[0:5, [0, 1, 3]]

# Task 4: Handling Missing Data

##### 4.1 Identify missing values in the DataFrame

In [None]:
df1 = pd.read_csv('messed_dataset.csv')
df1.head()

In [None]:
# identifying missing data
df1.isnull().sum()

##### 4.2 Drop rows with missing values

In [None]:
df2 = df1.dropna()
df2.head()

In [None]:
df2.isnull().sum()

##### 4.3 Fill missing values with a specified value

In [None]:
df3 = df1.fillna(value = 
          {
            "Name" :            "Ram",
            "Age"   :            "30",
            "Email" :           "ram@example.com",
            "Country" :         "Nepal",
            "Salary"  :          "30000",
            "DateOfJoining":  "2020-07-01"
            })
df3.head()

In [None]:
df3.isnull().sum()

# Task 5: Data Operations

##### 5.1 Add a new column to the DataFrame

In [None]:
my_dict = {
    "Name": ["Anish", "Manish", "Sanish", "Vanish"],
    "Age": [30, 29, 25, 18],
    "Occupation": ["Manager","Teacher", "Banker", "Student"]
}
df = pd.DataFrame(my_dict)
df["Salaries"] = [35000, 21000, 24000, 0]
df["Favourite Fruit"] = ["Apple", "Banana", "Orange", "Grapes"]
df

##### 5.2 Delete a column from the DataFrame

In [None]:
df.drop(columns=["Favourite Fruit"], inplace=True)
df

##### 5.3 Rename columns in the DataFrame

In [None]:
df.rename(columns={"Salaries":"Salary"}, inplace=True)
df

##### 5.4 Apply a function to a column

In [None]:
# increase salary of everyone by 1000
df["Salary"] = df["Salary"].apply(lambda x : x + 1000)
df

# Task 6: GroupBy Operations

##### 6.1 Group the DataFrame by a column and calculate summary statistics

In [None]:
# grouping by Department
my_data = {
    'Employee': ["Ram", "Hari", "LAxman", "Sita", "Hanuman"],
    'Department': ['HR', 'Finance', 'OM', 'Credit', 'IT'],
    'Age': ["29", "30", "333", "35", "28"],
    'Salary': [45000, 50000,55000, 60000, 38000]
}
df = pd.DataFrame(my_data)
grouped = df.groupby("Department")
grouped

In [None]:
# mean and max salary of grouped data
print("Mean\n", grouped["Salary"].mean())
print("Max\n", grouped["Salary"].max())

##### 6.2 Iterate over groups and display the group names and data

In [None]:
for group_name, group_data in grouped:
    print(group_name)
    print(group_data)

# Task 7: Merging and Joining DataFrames

##### 7.1 Merge two DataFrames on a common column

In [None]:
my_data1 = {
    "Name": ["A", "B", "C"],
    "Math" : [89, 80, 66],
    "Science" : [70, 77,60]
}

df9 = pd.DataFrame(my_data1)
df9



In [None]:
my_data2 = {
    "Nepali": [70,66,68],
    "English": [77, 67, 70]
}
df10 = pd.DataFrame(my_data2)
df10

In [None]:
merged_df = df9.join(df10)
merged_df

##### 7.2 Join two DataFrames using their indices

In [None]:
my_data3 = {
    "Sn": [1, 2, 3],
    "Name": ["x", "y", "z"],
    "Math": [90, 83, 77],
    "Science": [76, 64, 78]
}
df11 = pd.DataFrame(my_data3).set_index("Sn")
df11

In [None]:
my_data4 = {
    "Sn": [1, 2, 3],
    "Nepali": [67, 55, 63],
    "English": [73, 72, 63]
}
df12 = pd.DataFrame(my_data4).set_index("Sn")
df12

In [None]:
merged_df2 = pd.merge(df11, df12, left_index=True, right_index=True)
merged_df2

# Task 8: Working with Dates and Times

##### 8.1 Create a datetime index for the DataFrame

In [None]:
my_data = {
   'Employee': ["Ram", "Hari", "LAxman", "Sita", "Hanuman"],
    'Department': ['HR', 'Finance', 'OM', 'Credit', 'IT'],
    'Age': ["29", "30", "333", "35", "28"],
    'Salary': [45000, 50000,55000, 60000, 38000]
}
df = pd.DataFrame(my_data)
df

In [None]:
# Adding Date column to df
df["Date"] = pd.date_range(start='1/1/2020', periods=len(df), freq="M").astype(str)
df

##### 8.2 Convert a column to datetime and extract date components

In [None]:
df.info()

In [None]:
# Covert datatype of Date from object to datetime in column Date

df["Date"] = pd.to_datetime(df["Date"])
df

In [None]:
# rename Date to DateTime
df.rename(columns={"Date": "DateTime"}, inplace=True)
df

In [None]:
df.info()

In [None]:
# extract date components year and month
df["Year"] = df["DateTime"].dt.year
df["Month"] = df["DateTime"].dt.month
df

# Task 9: Input and Output

##### 9.1 Read from a csv file into a DataFrame

In [None]:
df = pd.read_csv("messed_dataset.csv")
df.head()

##### 9.2 Write the DataFrame to a csv

In [None]:
# dropping the null row from the datasets
df.dropna(inplace=True)
print(df.isnull().sum())

In [None]:
# writing the new datasets to csv
df.to_csv("dropped_null_from_messed_dataset.csv", index=False)

##### 9.3 Read data from an Excel file into a DataFrame

In [None]:
df = pd.read_excel("SaleData.xlsx")
df.head()

In [None]:
df.isnull().sum()

##### 9.4 Write the DataFrame to an Excel file

In [None]:
# drop null rows formt he datasets
df.dropna(inplace=True)
df.isnull().sum()

In [None]:
# writing the new dataset to excel
df.to_excel("dropped_null_from_SaleData.xlsx", index=False)

# Data Visualization
- Visualize the data by using the visualization libraries


# Task 10: Visualization

##### 10.1 Create a simple plot using Matplotlib

In [None]:
import numpy as np
import matplotlib.pyplot as plt

x_axis = np.array([1, 2, 3, 4, 5])
y_axis = np.array([2, 4, 6, 8, 16])
plt.plot(x_axis, y_axis)
plt.xlabel("Marks")
plt.ylabel("frequency")
plt.title("Line chart in matplotlib")
plt.show()

##### 10.2 Create a bar plot using Seaborn

In [None]:
import seaborn as sns

x_axis = np.array(['English', 'Nepali', 'Chineese', 'Spanish',
                  'Ronam', 'Hindi', 'Russian', 'Polish', 'French'])
y_axis = np.random.random(9) 
sns.barplot(x=x_axis, y=y_axis)
plt.xlabel('Languages')
plt.ylabel('Population')
plt.title('Bar Plot using Seaborn')  
plt.show()