# Data Transformation

In [2]:
import pandas as pd

In [4]:
s = pd.Series(['cat', 'cow', 'dog'])
print("Series:\n")
s

Series:



0    cat
1    cow
2    dog
dtype: object

In [6]:
print("Mapping: ")

s.map({'cat': 'kitten', 'cow': 'calf'})

Mapping: 


0    kitten
1      calf
2       NaN
dtype: object

In [12]:
import pandas as pd

df = pd.DataFrame(
	[('carrot', 'red', 1), 
	('papaya', 'yellow', 0),
	('mango', 'yellow', 0), 
	('apple', 'red', 0)
	], 
    
	columns=['species', 'color', 'type']
)

In [10]:
df

Unnamed: 0,species,color,type
0,carrot,red,1
1,papaya,yellow,0
2,mango,yellow,0
3,apple,red,0


In [14]:
print("Dataframe before Mapping: ")
print(df)



Dataframe before Mapping: 
  species   color  type
0  carrot     red     1
1  papaya  yellow     0
2   mango  yellow     0
3   apple     red     0


In [16]:
mappings = {
	'carrot': 'veg',
	'papaya': 'fruit',
    'mango':'king',
    'apple':"fruit"
}

mappings

{'carrot': 'veg', 'papaya': 'fruit', 'mango': 'king', 'apple': 'fruit'}

In [18]:
df['type'] = df['species'].map(mappings)

print("Dataframe after Mapping: ")
print(df)


Dataframe after Mapping: 
  species   color   type
0  carrot     red    veg
1  papaya  yellow  fruit
2   mango  yellow   king
3   apple     red  fruit


## New Dataframe

In [20]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon','Pastrami', 'corned beef', 'Bacon',
                            'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [23]:
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

In [25]:
data['food'] = data['food'].str.lower()
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [27]:
data['animal'] = data.food.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


## Bins

In [29]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# continuous 

In [31]:
bins = [18, 25, 35, 60, 100]

In [33]:
cats = pd.cut(ages, bins)

In [37]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

### The **purpose of `bins`** in this code is to define the **intervals (ranges)** that will be used to categorize the continuous values in the `ages` list. 

### **How Bins Work:**
- `bins = [18, 25, 35, 60, 100]`  
  - This creates **four intervals**:
    1. **(18, 25]** → Ages greater than 18 and up to 25  
    2. **(25, 35]** → Ages greater than 25 and up to 35  
    3. **(35, 60]** → Ages greater than 35 and up to 60  
    4. **(60, 100]** → Ages greater than 60 and up to 100  

- These bins **segment** the continuous age data into discrete groups.

### **Why Use Bins?**
- **Data Categorization**: Converts continuous numerical values into categorical groups.
- **Easier Analysis**: Helps in summarizing and visualizing data.
- **Useful in Machine Learning**: Some models work better with categorized data rather than raw continuous values.
- **Better Interpretability**: Instead of looking at raw numbers, we can group them into meaningful age ranges.

Let me know if you need further clarification! 🚀

### The cut() function in pandas is used to categorize (bin) continuous numerical data into discrete intervals (bins).

Functionality of pd.cut()
It assigns each value in the input array (e.g., ages) to one of the predefined bins.
The bins define the range of values, and cut() determines which bin each value falls into.
The output is a categorical object where each value is replaced by the corresponding bin label.

In [42]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [44]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [46]:
pd.value_counts(cats)

  pd.value_counts(cats)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

## New Data

In [48]:
data = pd.read_csv("weight-height.csv")
data

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.042470
4,Male,69.881796,206.349801
...,...,...,...
9995,Female,66.172652,136.777454
9996,Female,67.067155,170.867906
9997,Female,63.867992,128.475319
9998,Female,69.034243,163.852461


In [51]:
data.Weight

0       241.893563
1       162.310473
2       212.740856
3       220.042470
4       206.349801
           ...    
9995    136.777454
9996    170.867906
9997    128.475319
9998    163.852461
9999    113.649103
Name: Weight, Length: 10000, dtype: float64

In [53]:
data.Weight.describe()

count    10000.000000
mean       161.440357
std         32.108439
min         64.700127
25%        135.818051
50%        161.212928
75%        187.169525
max        269.989698
Name: Weight, dtype: float64

In [55]:
bins = [64,100,150,200,250,300]

In [57]:
categor = pd.cut(data.Weight, bins)

In [59]:
pd.value_counts(categor)

  pd.value_counts(categor)


Weight
(150, 200]    4717
(100, 150]    3857
(200, 250]    1272
(64, 100]      146
(250, 300]       8
Name: count, dtype: int64