## We can store data in a CSV or Excel file and analyse it using the pandas library.

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame.
df = pd.read_csv('data.csv')

# to_string() renders the frame as one plain-text table, which is then printed.
full_table = df.to_string()
print(full_table)

## Also, let's make a simple DataFrame

In [None]:
import pandas as pd

# Column-oriented input: each key becomes a column, each list its values.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

# Build a DataFrame object from the dictionary and display it.
df = pd.DataFrame(data)
print(df)

   calories  duration
0       420        50
1       380        40
2       390        45


## Data Cleaning

In [None]:
import pandas as pd

# Sample data with empty cells
data = {
    'ID': [1, 2, 3],
    'Name': ['John Doe', 'Jane Doe', 'Jim Beam'],
    'Age': [28, None, 34],
    'Email': ['john@example.com', 'jane@example.com', None]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['col']: that form is chained assignment,
# which emits a FutureWarning in pandas 2.x and does not modify `df` once
# Copy-on-Write is active.

# Fill missing values in the 'Age' column with the mean age
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Fill missing values in the 'Email' column with a placeholder
df['Email'] = df['Email'].fillna('noemail@example.com')

print("\nDataFrame after filling missing values:")
print(df)


Original DataFrame:
   ID      Name   Age             Email
0   1  John Doe  28.0  john@example.com
1   2  Jane Doe   NaN  jane@example.com
2   3  Jim Beam  34.0              None

DataFrame after filling missing values:
   ID      Name   Age                Email
0   1  John Doe  28.0     john@example.com
1   2  Jane Doe  31.0     jane@example.com
2   3  Jim Beam  34.0  noemail@example.com


## Handling Data in the Wrong Format

In [None]:
import pandas as pd

# Sample data with dates in different formats
data = {
    'ID': [1, 2, 3],
    'Name': ['John Smith', 'Emily Davis', 'Michael Lee'],
    'Date of Birth': ['1985-12-05', '05/12/1987', '12/05/90']
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Standardize date format to YYYY-MM-DD
# The column mixes several layouts, so a single inferred format cannot match
# every row (pandas 2.x raises ValueError in that case). format='mixed' makes
# pandas infer the format of each element individually; dayfirst=True resolves
# ambiguous values such as '05/12/1987' as day/month/year.
parsed_dates = pd.to_datetime(df['Date of Birth'], format='mixed', dayfirst=True)
df['Date of Birth'] = parsed_dates.dt.strftime('%Y-%m-%d')

print("\nDataFrame after standardizing date format:")
print(df)


Original DataFrame:
   ID         Name Date of Birth
0   1   John Smith    1985-12-05
1   2  Emily Davis    05/12/1987
2   3  Michael Lee      12/05/90


ValueError: time data "05/12/1987" doesn't match format "%Y-%d-%m", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

## Handling Wrong Data

In [None]:
import pandas as pd

# Sample data with wrong data (negative price)
data = dict(
    ID=[1, 2, 3],
    Product=['Apple', 'Banana', 'Cherry'],
    Price=[1.20, 0.50, -1.00],
)

# Build the frame and show it before any correction is applied.
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Define a function to correct negative prices
def correct_price(price):
    """Return `price` with a negative sign removed.

    Values below zero are replaced by their absolute value; every other
    value is returned unchanged.
    """
    return abs(price) if price < 0 else price

# Run every value of the 'Price' column through correct_price,
# then store the corrected column back on the frame.
corrected_prices = df['Price'].apply(correct_price)
df['Price'] = corrected_prices

print("\nDataFrame after correcting wrong data:")
print(df)


## Drop Duplicate Rows

In [None]:
import pandas as pd

# Sample data with duplicates
data = dict(
    ID=[1, 2, 3, 3],
    Name=['Anna Brown', 'Tom Green', 'Anna Brown', 'Anna Brown'],
    Age=[30, 45, 30, 30],
)

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Removing duplicate rows: keep the first occurrence of each fully identical
# row and rebind `df` to the de-duplicated result.
df = df.drop_duplicates()

print("\nDataFrame after removing duplicates:")
print(df)


## Viewing Data

In [None]:
# Importing pandas
import pandas as pd

# Creating DataFrames
# From a dictionary: keys become column names, lists become column values
data = {
    'Name': ['John', 'Jane', 'Jim'],
    'Age': [28, 24, 31],
}
df = pd.DataFrame(data)

# From a CSV file (assuming a file named 'file.csv' exists)
# df = pd.read_csv('file.csv')

In [None]:


# Viewing Data
# Display the first few rows
print("First few rows of the DataFrame:")
print(df.head())

# Display the last few rows
print("\nLast few rows of the DataFrame:")
print(df.tail())

# Get basic information about the DataFrame
# df.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
print("\nBasic information about the DataFrame:")
df.info()

In [None]:
# Select a column (a single label yields a Series)
name_column = df['Name']
print("\nSelecting the 'Name' column:")
print(name_column)

# Select multiple columns (a list of labels yields a DataFrame)
name_and_age = df[['Name', 'Age']]
print("\nSelecting the 'Name' and 'Age' columns:")
print(name_and_age)

# Select rows by index (iloc is position-based)
first_row = df.iloc[0]
print("\nSelecting the first row by index:")
print(first_row)

first_two_rows = df.iloc[0:2]
print("\nSelecting the first two rows by index:")
print(first_two_rows)

# Select rows by condition (boolean mask built from the 'Age' column)
older_than_25 = df['Age'] > 25
print("\nSelecting rows where 'Age' is greater than 25:")
print(df[older_than_25])

In [None]:

# Modifying Data
# Add a new column (one value per existing row)
df['Score'] = [85, 90, 88]
print("\nDataFrame after adding a new column 'Score':")
print(df)

# Update values
# Update specific cell: row label 0, column 'Age'
df.loc[0, 'Age'] = 29
print("\nDataFrame after updating 'Age' of the first row:")
print(df)

# Update entire column
df['Age'] = df['Age'].add(1)
print("\nDataFrame after incrementing 'Age' column by 1:")
print(df)

# Drop a column
df = df.drop(columns='Score')
print("\nDataFrame after dropping the 'Score' column:")
print(df)

# Drop rows (by index label)
df = df.drop(index=[0, 1])
print("\nDataFrame after dropping the first two rows:")
print(df)

# Handling Missing Data
# Check for missing values: count the nulls in each column
null_counts = df.isnull().sum()
print("\nChecking for missing values:")
print(null_counts)

# Fill missing values (adding some missing data for demonstration)
df = pd.DataFrame({'Name': ['John', 'Jane', 'Jim'], 'Age': [28, None, 31], 'Email': [None, 'jane@example.com', 'jim@example.com']})
print("\nDataFrame with missing values:")
print(df)

# Assign the filled Series back to the column. Calling
# fillna(..., inplace=True) on df['col'] is chained assignment: it emits a
# FutureWarning in pandas 2.x and leaves `df` untouched under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
print("\nDataFrame after filling missing 'Age' values with mean:")
print(df)

df['Email'] = df['Email'].fillna('noemail@example.com')
print("\nDataFrame after filling missing 'Email' values with placeholder:")
print(df)

# Drop rows with missing values
# Both columns were filled above, so no row is removed here; the call only
# demonstrates dropna().
df.dropna(inplace=True)
print("\nDataFrame after dropping rows with missing values:")
print(df)

# Sorting Data
# Build a fresh frame, then order its rows from oldest to youngest.
df = pd.DataFrame({'Name': ['John', 'Jane', 'Jim'], 'Age': [28, 24, 31]})
df = df.sort_values(by='Age', ascending=False)
print("\nDataFrame after sorting by 'Age' in descending order:")
print(df)

# Grouping Data
# Rows sharing a 'Name' are collapsed into one row holding the column means.
df = pd.DataFrame({'Name': ['Anna', 'Anna', 'Bob'], 'Age': [30, 25, 30], 'Score': [85, 90, 88]})
rows_by_name = df.groupby('Name')
grouped_df = rows_by_name.mean()
print("\nDataFrame after grouping by 'Name' and calculating the mean:")
print(grouped_df)

# Merging DataFrames
# Join on 'key', keeping only keys present in both frames. The overlapping
# 'value' column is disambiguated with the default _x / _y suffixes.
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value': [4, 5, 6]})
merged_df = df1.merge(df2, on='key')
print("\nMerged DataFrame on 'key':")
print(merged_df)

# Saving Data
# Write the current `df` to 'output.csv' in the working directory.
# index=False leaves the row index out of the file.
df.to_csv('output.csv', index=False)
print("\nDataFrame saved to 'output.csv'")


## NumPy

In [None]:
import numpy as np

# Create a 1-D array from a Python list.
arr = np.array([1, 2, 3, 4, 5])

# Show the array contents first, then the type of the array object.
for item in (arr, type(arr)):
    print(item)

[1 2 3 4 5]
<class 'numpy.ndarray'>


### Dimensions of NumPy arrays

In [None]:
# One array per nesting depth: a scalar, a flat list, a list of lists,
# and a list of lists of lists.
a = np.array(42)
b = np.array([1, 2, 3, 4, 5])
c = np.array([[1, 2, 3], [4, 5, 6]])
d = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])

# ndim reports the number of dimensions (axes) of each array.
for array in (a, b, c, d):
    print(array.ndim)


0
1
2
3


In [None]:
import numpy as np

arr = np.array([1, 2, 3, 4])

# Indexing is zero-based, so index 1 is the second element.
second_element = arr[1]
print(second_element)

In [None]:
# Two rows of five values each.
arr = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
])

# arr[row, column]: row 0, column 1.
print('2nd element on 1st row: ', arr[0, 1])

In [None]:
import numpy as np

arr = np.array([1, 2, 3, 4, 5, 6, 7])

# Slice from index 4 through the end of the array.
from_index_four = arr[4:]
print(from_index_four)

[5 6 7]


In [None]:
import numpy as np

# Twelve values in a flat, 1-D array.
arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

# Rearrange the same twelve values into 4 rows of 3 columns.
newarr = arr.reshape((4, 3))

print(newarr, newarr.shape)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]] (4, 3)


In [None]:
import numpy as np

arr = np.array([1, 2, 3])
#arr = np.array([[1, 2, 3], [4, 5, 6]])

# Iterating an array walks its first axis; for this 1-D array each step
# yields one element.
for x in arr:
    print(x)

In [None]:
import numpy as np

# Simulating a dataset
# Rows represent students, columns represent subjects (Math, Science, English)
subject_names = ['Math', 'Science', 'English']
scores = np.array([
    [85, 90, 78],
    [88, 92, 80],
    [75, 85, 85],
    [90, 87, 88]
])

# Calculate the mean score for each student (axis=1 averages across a row)
mean_scores_students = scores.mean(axis=1)
print("Mean scores for each student:", mean_scores_students)

# Calculate the mean score for each subject (axis=0 averages down a column)
mean_scores_subjects = scores.mean(axis=0)
print("Mean scores for each subject:", mean_scores_subjects)

# Find the student with the highest average score (reported as a row index)
best_student_index = mean_scores_students.argmax()
print("Student with the highest average score:", best_student_index)

# Find the subject with the highest average score (index mapped to its name)
best_subject_index = mean_scores_subjects.argmax()
print("Subject with the highest average score:", subject_names[best_subject_index])


Mean scores for each student: [84.33333333 86.66666667 81.66666667 88.33333333]
Mean scores for each subject: [84.5  88.5  82.75]
Student with the highest average score: 3
Subject with the highest average score: Science


In [None]:
import numpy as np

# Define two matrices
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[5, 6], [7, 8]])

# Perform matrix multiplication (the @ operator is the matrix product
# for 2-D arrays)
result = matrix_a @ matrix_b

# Print both operands followed by the resulting matrix
print("Matrix A:")
print(matrix_a)
print("\nMatrix B:")
print(matrix_b)
print("\nResult of Matrix Multiplication:")
print(result)
