In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Add "../../../Optimus" to the module search path (presumably a local
# Optimus checkout for the import in the next cell — confirm). The path is
# relative, so it resolves against the kernel's working directory.
import sys
sys.path.append("../../../Optimus")

# Chapter Three - Data Wrangling

In [3]:
from optimus import Optimus 
# `op` is the handle the rest of the notebook uses to create and load
# dataframes; "pandas" is the engine name passed to the constructor.
op = Optimus("pandas")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LuisA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Small sample frame with mixed value types: "name" and "job" hold both
# strings and integers, "id" holds integers, and "birth" holds date-like
# strings. The conversion and selection cells below operate on it.
df = op.create.dataframe({
    "name": ["optimus", "bumblebee", "eject", 2],
    "job": ["Leader", "Espionage", 1, 3],
    "id": [1, 2, 3, 4],
    "birth": ["22/10/10", "22/08/08", "23/07/07", "22/10/10"]
})
df.display()

In [None]:
# Show the object stored in df.data and then its type.
print(df.data)
print(type(df.data))

### Converting datatypes

In [None]:
# Attempt a direct astype(int) on the "name" column of df.data. Any exception
# is caught and its message printed, so the notebook keeps running; the
# printed error appears to be what this cell is meant to show.
try:
    df.data["name"].astype(int)
except Exception as e:
    print(e)

In [None]:
df.cols.to_integer("name")

In [None]:
df.cols.to_float("name")

In [None]:
df.cols.to_string("id")

In [None]:
# NOTE(review): "YY/mm/dd" is not written in the strftime style ("%m-%d-%Y")
# that the date_format cell later in this notebook uses — confirm which
# format tokens to_datetime expects.
df.cols.to_datetime("birth", "YY/mm/dd")

### Selecting columns

In [None]:
df.cols.select("job")

In [None]:
df.cols.select(["name", "job"])

In [None]:
df.cols.select(regex="n.*")

In [None]:
df.cols.select(data_type="int")

### Moving columns

In [None]:
df.cols.move(["name", "job"], "after", "id")

In [None]:
df.cols.move(["name", "job"], "end")

### Renaming columns

In [None]:
# Rename in two steps and keep the result in df2. The following cells keep
# using df with the original column names ("name", "job", "id").
df2 = df.cols.rename(["name", "job"], ["string_name", "string_job"])
df2 = df2.cols.rename("id", "int_id")

### Removing columns

In [None]:
df.cols.drop("job")

In [None]:
df.cols.select(["job", "name"])

### Input and output columns

In [None]:
df.cols.upper("name")

In [None]:
df.cols.upper(["name", "job"])

In [None]:
df.cols.upper("name", output_cols="name_upper")

In [None]:
df.cols.upper(["name", "job"], output_cols=["name_upper", "job_upper"])

In [None]:
# "upper" is passed positionally as the second argument here; the two cells
# above fill output_cols by keyword — presumably this is the same parameter,
# confirm against the Optimus signature.
df.cols.upper(["name", "job"], "upper")

In [None]:
df.cols.upper("*")

In [None]:
df.cols.upper("name").cols.lower("job")

### More column management functions

In [None]:
df.cols.drop("job")

In [None]:
df.cols.keep("id")

In [None]:
df.cols.rename("name", "first_name")

In [None]:
df.cols.duplicate(["name", "id"], ["first_name", "id_number"])

In [None]:
df.cols.set("last_name", None)

In [None]:
df.cols.set("last_name", None).cols.fill_na("last_name", "Placeholder Value")

### String functions

In [None]:
df.cols.upper("name")

In [None]:
df.cols.pad("id", 3, "0")

### Merge and split

In [None]:
# Reassigns df: each row stores a position as one "x, y" string in
# "xy_position" next to a float "depth".
df = op.create.dataframe({ 
    "xy_position": ["42.8, 7.7", "23.3, 25.1", "35.6, 50.5", "52.7, 67.4"],
    "depth": [10.5, 50.1, 20.2, 97.0]
})

In [None]:
# Split "xy_position" on ", " into output columns "x" and "y". The result is
# only displayed, not assigned, so df itself is not rebound by this cell.
df.cols.unnest("xy_position", separator=", ", output_cols=["x", "y"], drop=True)

In [None]:
# Runs on the df created above (the unnest result was not assigned), so
# "xy_position" and "depth" are the inputs joined with ", " into
# "xyz_position".
df.cols.nest(["xy_position", "depth"], separator=", ", output_col="xyz_position", drop=True)

### Search and replace

In [None]:
df = op.create.dataframe({
    "name": ["Optimus, Prime", " Arcee, Ariel",  "Bumblebee/Maggiolino"]
})


In [None]:
# Search "name" for ", " and "/" and replace them with a single space.
# NOTE(review): exact matching semantics of search_by="chars" are defined by
# Optimus — confirm.
df.cols.replace("name", [", ", "/"], " ", search_by="chars")

### Numeric functions

In [None]:
# Reassigns df: one column mixing a float, an int and a numeric string.
df = op.create.dataframe({ "values": [0.5, 2, "3.14"] })

In [None]:
df.cols.sin("values")

In [None]:
# NOTE(review): the second argument (5) is presumably the logarithm base —
# confirm against the Optimus signature.
df.cols.log("values", 5)

### Date and time functions

In [None]:
# Reassigns df: date strings written day-first ("30/09/2020" and "21/01/2030"
# cannot be read month-first).
df = op.create.dataframe({ "date": ["09/01/2021", "30/09/2020", "03/07/2020", "21/01/2030"] })

In [None]:
df.cols.year("date")

In [None]:
# Re-render "date" with the strftime-style pattern month-day-year.
# NOTE(review): no input format is passed, and the source strings are
# day-first — confirm how date_format parses values such as "09/01/2021".
df.cols.date_format("date", output_format="%m-%d-%Y")

### URL functions

In [None]:
# Build a one-row frame holding a URL in column "A" and store the results of
# the port / sub_domain / domain column functions as new columns.
# Reuses the `op` engine created at the top of the notebook instead of
# importing Optimus and instantiating a second engine mid-notebook.
data = {"A": ["https://www.hi-optimus.com:8080/index.php?a=1"]}
df = op.create.dataframe(data)

df["port"] = df.cols.port("A")
df["subdomain"] = df.cols.sub_domain("A")
df["domain"] = df.cols.domain("A")

df.display()

### Email functions

In [None]:
# Reassigns df to a one-row frame with an e-mail address in "A", then stores
# the results of email_username and email_domain as new columns.
df = op.create.dataframe({"A": ["optimus@cybertron.com"]})
 
df["username"] = df.cols.email_username("A")
df["domain"] = df.cols.email_domain("A")

df.display()

### User-defined functions (UDF)

In [None]:
# Two integer columns (A = 0..5, B = 6..11) used by the apply() cells below.
data = {"A":[0,1,2,3,4,5],"B":[6,7,8,9,10,11]} 
df = op.create.dataframe(data)
df.display()

In [None]:
def add_two(pandas_series):
    """Print the type of the argument, then return it increased by 2.

    The printed type shows what kind of object the caller handed in.
    """
    print(type(pandas_series))
    shifted = pandas_series + 2
    return shifted
 
df.cols.apply("A", add_two)


In [None]:
def add_two(single_value):
    """Print the type of ``single_value`` and return ``single_value + 2``."""
    print(type(single_value))
    result = single_value + 2
    return result
 
df.cols.apply("A", add_two, mode="map").print()

In [None]:
def add_two(value):
    """Return ``value + 2`` without printing anything."""
    incremented = value + 2
    return incremented

print(df.cols.apply("*", add_two))

In [4]:
# Load "DCIGNP2AYL.txt" (path relative to the working directory). The
# displayed output lists hotel rows with float64 "latitude" and "longitude"
# columns, which the distance cells below read.
df = op.load.file("DCIGNP2AYL.txt")
df.display()

ean_hotel_id  1 (int64),name  2 (object),address1  3 (object),city  4 (object),state_province  5 (object),postal_code  6 (object),latitude  7 (float64),longitude  8 (float64),star_rating  9 (float64),high_rate  10 (float64),low_rate  11 (float64)
269955,Hilton⋅Garden⋅Inn⋅Albany/SUNY⋅Area,1389⋅Washington⋅Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
113431,Courtyard⋅by⋅Marriott⋅Albany⋅Thruway,1455⋅Washington⋅Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
108151,Radisson⋅Hotel⋅Albany,205⋅Wolf⋅Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16
254756,Hilton⋅Garden⋅Inn⋅Albany⋅Medical⋅Center,62⋅New⋅Scotland⋅Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
198232,CrestHill⋅Suites⋅SUNY⋅University⋅Albany,1415⋅Washington⋅Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39
125200,The⋅Desmond⋅Hotel⋅Albany,660⋅Albany⋅Shaker⋅Rd,Albany,NY,12211,42.72874,-73.79807,3.5,189.0266,153.0644
109728,Ramada⋅Plaza⋅Albany,3⋅Watervliet⋅Avenue⋅Ext,Albany,NY,12206,42.68031,-73.78444,3.0,158.6321,89.036
235037,Hampton⋅Inn⋅&⋅Suites⋅Albany-Downtown,25⋅Chapel⋅St,Albany,NY,12210,42.65334,-73.75142,2.5,225.47,224.47
106464,Albany⋅Marriott,189⋅Wolf⋅Rd,Albany,NY,12205,42.72111,-73.80036,3.5,158.8856,128.9077
106922,Best⋅Western⋅Sovereign⋅Hotel⋅-⋅Albany,1228⋅Western⋅Ave,Albany,NY,12203,42.67807,-73.82819,3.0,139.0244,78.3255


In [None]:
from optimus.functions import F

def haversine(lat1, lon1, lat2, lon2, radius=3959):
    """Great-circle distance between two points via the haversine formula.

    All four coordinates are converted with ``F.radians`` before use, so they
    are expected in degrees. Each may be whatever ``F``'s functions accept
    (the notebook passes two floats and two dataframe columns).

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.
        radius: Radius of the sphere; the result is in the same unit.
            Defaults to 3959, the value previously hard-coded as ``MILES``.

    Returns:
        ``radius`` multiplied by the central angle between the two points.
    """
    lat1, lon1, lat2, lon2 = map(F.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # Haversine of the central angle.
    a = F.sin(dlat / 2) ** 2 + F.cos(lat1) * F.cos(lat2) * F.sin(dlon / 2) ** 2
    # Central angle in radians.
    c = 2 * F.asin(F.sqrt(a))
    # No print here: the function only returns the value, so timing it and
    # calling it on whole columns does not flood the cell output.
    return radius * c

In [None]:
%%time
# Distance from the fixed reference point (40.671, -73.985) to each row's
# latitude/longitude, stored in a new "distance" column.
df["distance"] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

In [7]:
# Scratch check on the raw columns in df.data: latitude minus longitude after
# converting each with np.radians.
# NOTE(review): pandas and numpy are imported here rather than at the top of
# the notebook, and this cell's execution count (7) is out of sequence with
# the surrounding cells — confirm it still belongs in the final notebook.
import pandas as pd
import numpy as np
pd.Series(np.radians(df.data['latitude'])) - pd.Series(np.radians(df.data['longitude']))

0       2.033377
1       2.033482
2       2.033698
3       2.032051
4       2.033435
          ...   
1626    2.004572
1627    2.003978
1628    2.003300
1629    2.003692
1630    2.004557
Length: 1631, dtype: float64

In [None]:
# Display only the "distance" column.
df.cols.select("distance")

In [None]:
df

In [None]:
def udf(df):
    """Return the sum of ``cols.cos()`` and ``cols.sin()`` of ``df["id"]``.

    The parameter shadows the notebook-level ``df`` inside the function.

    NOTE(review): the frame loaded from "DCIGNP2AYL.txt" shows no "id" column
    in its displayed output (its identifier column is "ean_hotel_id"), so the
    call below looks like it relies on an earlier ``df`` — confirm.
    """
    return df["id"].cols.cos()+ df["id"].cols.sin()

udf(df)
# df.cols.assign({"a":udf})