In [61]:
# Import libraries
import glob
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import datetime
from dateutil import parser

In [47]:
# Get CSV files list from a folder
path = 'Weather_Data'
csv_files = glob.glob(path + "/*.csv")

df_list = (pd.read_csv(file).iloc[1:] for file in csv_files)

# Concatenate all DataFrames
weather_data = pd.concat(df_list, ignore_index=True)

  objs = list(objs)


In [108]:
# importing AIS dataset here
df_ais = pd.read_csv("./MarineTraffic/AIS_2022_06_30.csv")
df_ais.rename(columns={'LAT': 'latitude', 'LON': 'longitude'}, inplace=True)
df_ais.head()

Unnamed: 0,MMSI,BaseDateTime,latitude,longitude,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass
0,366999658,2022-06-30T00:00:00,30.12422,-85.63672,0.0,108.0,18,CG COBIA,,NTXJ,90.0,0.0,26.0,5.0,,55.0,A
1,538004028,2022-06-30T00:00:01,25.67896,-79.61538,9.2,180.2,182,RIO GRANDE,IMO9593438,V7UT9,80.0,0.0,274.0,48.0,17.1,80.0,A
2,367684960,2022-06-30T00:00:02,40.71821,-89.54574,0.0,360.0,511,CAROLE K,,WDI3100,31.0,15.0,15.0,,,31.0,A
3,367533020,2022-06-30T00:00:03,31.10169,-91.60765,6.9,304.5,305,MARTHA LYNN,IMO8836637,WDG4104,31.0,12.0,54.0,15.0,,57.0,A
4,368103510,2022-06-30T00:00:04,30.42336,-91.19565,0.0,239.3,511,ANDY MCKINNEY,,WDK9416,31.0,0.0,24.0,10.0,3.0,31.0,A


In [3]:
weather_data.head()

Unnamed: 0,time,depth,latitude,longitude,Tdir,Tper,Thgt,sdir,sper,shgt,wdir,wper,whgt
0,2022-09-08T19:00:00Z,0.0,-77.5,0.0,,,,,,,,,
1,2022-09-08T19:00:00Z,0.0,-77.5,0.5,,,,,,,,,
2,2022-09-08T19:00:00Z,0.0,-77.5,1.0,,,,,,,,,
3,2022-09-08T19:00:00Z,0.0,-77.5,1.5,,,,,,,,,
4,2022-09-08T19:00:00Z,0.0,-77.5,2.0,,,,,,,,,


In [4]:
def missing_statistics(df):
    statitics = pd.DataFrame(df.isnull().sum()).reset_index()
    statitics.columns = ['COLUMN NAME', "MISSING VALUES"]
    statitics['TOTAL ROWS'] = df.shape[0]
    statitics['% MISSING'] = round(
        (statitics['MISSING VALUES']/statitics['TOTAL ROWS'])*100, 2)
    return statitics


missing_statistics(weather_data)


Unnamed: 0,COLUMN NAME,MISSING VALUES,TOTAL ROWS,% MISSING
0,time,0,12539520,0.0
1,depth,0,12539520,0.0
2,latitude,0,12539520,0.0
3,longitude,0,12539520,0.0
4,Tdir,4835194,12539520,38.56
5,Tper,4833808,12539520,38.55
6,Thgt,4833808,12539520,38.55
7,sdir,5445000,12539520,43.42
8,sper,5445000,12539520,43.42
9,shgt,5445000,12539520,43.42


In [150]:
# Dropping rows with na
df_ais.dropna(inplace=True, axis=0)
weather_data.dropna(inplace=True, axis=0)
weather_data.reset_index(drop=True, inplace=True)

In [6]:
def convertISOtoLocal(isoString):
    return parser.parse(isoString).replace(tzinfo=datetime.timezone.utc).astimezone().strftime("%d/%m/%Y %I:%M:%S %p")


In [7]:
# converting all the ISO timestrings in
# "time" attribute to IST timezone
# weather_data["time"] = weather_data["time"].apply(lambda x: convertISOtoLocal(x))


In [8]:
weather_data.head()


Unnamed: 0,time,depth,latitude,longitude,Tdir,Tper,Thgt,sdir,sper,shgt,wdir,wper,whgt
0,2022-09-08T19:00:00Z,0.0,-77.0,207.5,121.0,1.9193857,0.17,18.0,3.04,0.06,124.0,1.92,0.14999999
1,2022-09-08T19:00:00Z,0.0,-77.0,208.0,129.0,1.8832392,0.17,6.0,3.22,0.06,138.0,1.87,0.16
2,2022-09-08T19:00:00Z,0.0,-76.5,207.5,102.0,2.173913,0.14999999,21.0,2.8799999,0.06,110.0,2.1499999,0.14
3,2022-09-08T19:00:00Z,0.0,-76.0,211.5,66.0,2.5188916,0.12,38.0,2.87,0.08,109.0,2.1599998,0.089999996
4,2022-09-08T19:00:00Z,0.0,-76.0,212.0,31.0,2.8985505,0.12,39.0,2.8999999,0.08,126.0,1.4499999,0.08


In [9]:
# getting unique values of time ranges
weather_data = weather_data.iloc[1:]
allTimeRanges = weather_data["time"].unique()

# removing all occurence of the word "UTC"
allTimeRanges = allTimeRanges[allTimeRanges != "UTC"]
allTimeRanges


array(['2022-09-08T19:00:00Z', '2022-09-08T20:00:00Z',
       '2022-09-08T21:00:00Z', '2022-09-08T22:00:00Z',
       '2022-09-08T23:00:00Z', '2022-09-09T00:00:00Z',
       '2022-09-09T01:00:00Z', '2022-09-08T00:00:00Z',
       '2022-09-08T01:00:00Z', '2022-09-08T02:00:00Z',
       '2022-09-08T03:00:00Z', '2022-09-08T04:00:00Z',
       '2022-09-08T05:00:00Z', '2022-09-08T06:00:00Z',
       '2022-09-08T07:00:00Z', '2022-09-08T08:00:00Z',
       '2022-09-08T09:00:00Z', '2022-09-08T10:00:00Z',
       '2022-09-08T11:00:00Z', '2022-09-08T12:00:00Z',
       '2022-09-08T13:00:00Z', '2022-09-08T14:00:00Z',
       '2022-09-08T15:00:00Z', '2022-09-08T16:00:00Z',
       '2022-09-08T17:00:00Z', '2022-09-08T18:00:00Z',
       '2022-09-09T02:00:00Z', '2022-09-09T03:00:00Z',
       '2022-09-09T04:00:00Z', '2022-09-09T05:00:00Z',
       '2022-09-09T06:00:00Z', '2022-09-09T07:00:00Z',
       '2022-09-09T08:00:00Z', '2022-09-09T09:00:00Z',
       '2022-09-09T10:00:00Z', '2022-09-09T11:00:00Z',
       '20

In [10]:
# working on only 1 time range for now
x_subset = weather_data.loc[weather_data["time"] == allTimeRanges[0]]
x_subset.shape

(140879, 13)

In [153]:
# rounding all AIS lat and lng to nearest location 
def roundToNearestLoc(val):
    return round(val * 2) / 2

df_ais["latitude"] = df_ais["latitude"].apply(lambda x: roundToNearestLoc(x))
# adding 180 to the longitude to make it in the range of 0-360
df_ais["longitude"] = df_ais["longitude"].apply(lambda x: roundToNearestLoc(x) + 180)
df_ais[["latitude", "longitude"]]

Unnamed: 0,latitude,longitude
1,25.5,280.5
8,32.0,279.0
10,34.0,284.0
12,40.5,286.0
13,33.0,282.5
...,...,...
1048566,49.5,237.0
1048567,40.5,286.0
1048569,41.0,289.0
1048570,29.5,266.5


In [154]:
from pandas.api.types import is_numeric_dtype, is_string_dtype

weather_data["latitude"] = pd.to_numeric(weather_data["latitude"])
weather_data["longitude"] = pd.to_numeric(weather_data["longitude"])

In [163]:
# direct matches
df = pd.merge(weather_data, df_ais, on=['latitude','longitude'], how='outer', indicator='Exist')
df['Exist'] = np.where(df.Exist == 'both', True, False)


In [156]:
print("Exact match present:", round(((df["Exist"] == True).sum() / df.shape[0]) * 100), "%")
print("Exact match not present:", round(((df["Exist"] == False).sum() / df.shape[0]) * 100), "%")

Exact match present: 49 %
Exact match not present: 51 %


In [164]:
df.shape

Unnamed: 0,time,depth,latitude,longitude,Tdir,Tper,Thgt,sdir,sper,shgt,...,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass,Exist
0,2022-09-08T19:00:00Z,0.0,-77.0,207.5,121.0,1.9193857,0.17,18.0,3.04,0.06,...,,,,,,,,,,False
1,2022-09-08T20:00:00Z,0.0,-77.0,207.5,123.0,1.934236,0.16,17.0,3.05,0.05,...,,,,,,,,,,False
2,2022-09-08T11:00:00Z,0.0,-77.0,207.5,34.0,2.932551,0.22,41.0,2.94,0.18,...,,,,,,,,,,False
3,2022-09-08T12:00:00Z,0.0,-77.0,207.5,27.0,2.97619,0.21,47.0,2.98,0.18,...,,,,,,,,,,False
4,2022-09-08T13:00:00Z,0.0,-77.0,207.5,26.0,2.994012,0.19,29.0,3.0,0.13,...,,,,,,,,,,False


In [167]:
df = df.loc[df["Exist"] == True]
df.reset_index(inplace=True, drop= True)
df.head()
df.to_csv("Draft Cleaned Merged.csv")