In [39]:
#Importing Libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np

In [40]:
#Making Dataframe
df=pd.read_csv('sample_data.csv')
df.head()
df.shape
df.drop(df.tail(12501).index,inplace=True)

In [3]:
df.shape

(1000, 4)

In [102]:
df

Unnamed: 0,mmsi,timestamp,lat,lon
0,565761000,2023-03-15 00:27:44+00,1.268780,103.75827
1,538008084,2023-03-19 23:30:00+00,43.559620,10.29404
2,564654000,2023-03-12 08:22:53+00,1.237250,103.89135
3,529123000,2023-03-05 16:47:42+00,29.443670,48.93066
4,564780000,2023-03-11 06:35:20+00,1.277550,103.61026
...,...,...,...,...
995,538008064,2023-03-18 13:30:00+00,8.832311,-79.54650
996,563078430,2023-03-06 06:57:31+00,1.243280,103.75625
997,563014650,2023-03-21 19:31:25+00,1.228590,103.88587
998,564654000,2023-03-19 21:37:08+00,1.217320,103.78894


In [4]:
#Converting lat and lon into point
Points=df.apply(lambda row: Point(row.lon,row.lat),axis=1)
Points

0                          POINT (103.75827 1.26878)
1      POINT (10.294040476292777 43.559619523707326)
2                          POINT (103.89135 1.23725)
3                          POINT (48.93066 29.44367)
4                          POINT (103.61026 1.27755)
                           ...                      
995               POINT (-79.5465 8.832310715342649)
996                        POINT (103.75625 1.24328)
997                        POINT (103.88587 1.22859)
998                        POINT (103.78894 1.21732)
999                        POINT (50.13057 28.76039)
Length: 1000, dtype: object

In [5]:
#Converting to geodataframe
Data=gpd.GeoDataFrame(df,geometry=Points)
Data.crs={'init':'epsg:4326'}
Data.head()

  in_crs_string = _prepare_from_proj_string(in_crs_string)


Unnamed: 0,mmsi,timestamp,lat,lon,geometry
0,565761000,2023-03-15 00:27:44+00,1.26878,103.75827,POINT (103.75827 1.26878)
1,538008084,2023-03-19 23:30:00+00,43.55962,10.29404,POINT (10.29404 43.55962)
2,564654000,2023-03-12 08:22:53+00,1.23725,103.89135,POINT (103.89135 1.23725)
3,529123000,2023-03-05 16:47:42+00,29.44367,48.93066,POINT (48.93066 29.44367)
4,564780000,2023-03-11 06:35:20+00,1.27755,103.61026,POINT (103.61026 1.27755)


In [6]:
#Haversine Function
def haversine(cord1, cord2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1=cord1
    lat2, lon2 = cord2
    # Haversine formula
    dlon = np.radians(lon2 - lon1)
    dlat = np.radians(lat2 - lat1)
    a = np.sin(dlat/2)**2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [7]:
#Getting Coordinates from the Dataframe
coords=Data.geometry.apply(lambda point:(point.y,point.x)).tolist()

In [8]:
#Calculating the distance between each pair of coords using Haversine function and creating a flattened matrix for it
from scipy.spatial.distance import pdist,squareform
distance_matrix=squareform(pdist(coords,lambda u,v: haversine(u,v)))

In [11]:
#Creating a pandas matrix having same row and column
distance_df=pd.DataFrame(distance_matrix,index=Data.mmsi,columns=Data.mmsi)
distance_df

mmsi,565761000,538008084,564654000,529123000,564780000,563014650,563014650,564654000,563078430,564654000,...,564654000,564780000,563014650,563078430,564654000,538008064,563078430,563014650,564654000,529123000
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
565761000,0.000000,10189.258322,15.204034,6580.100895,16.482771,21.635983,12.842560,15.434520,13.471887,6.991333,...,9.836609,19.352041,29.465679,6.620080,14.395723,18833.709573,2.844349,14.872405,6.660880,6445.696187
538008084,10189.258322,0.000000,10202.383337,3750.431120,10176.678210,10202.357054,10196.106591,10202.900337,10200.604324,10196.162970,...,10198.351991,10204.689263,10209.403048,10188.724640,10199.766648,9319.407245,10191.052827,10202.607208,10195.674899,3889.440521
564654000,15.204034,10202.383337,0.000000,6594.326627,31.567930,9.545634,19.572019,0.688694,1.825833,10.753476,...,12.114759,4.620542,15.508762,19.755294,3.396074,18832.376761,15.033875,1.139475,11.598543,6459.957213
529123000,6580.100895,3750.431120,6594.326627,0.000000,6565.919329,6595.907671,6585.041191,6594.759916,6592.503893,6586.724714,...,6588.439986,6597.272680,6603.576106,6578.472428,6592.110847,13052.395779,6581.501053,6594.359806,6586.128216,139.150880
564780000,16.482771,10176.678210,31.567930,6565.919329,0.000000,37.896790,19.429567,31.731866,29.889026,21.700273,...,22.745451,35.821497,45.878038,12.709425,30.868895,18837.758500,16.670786,31.119013,20.962252,6431.459374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538008064,18833.709573,9319.407245,18832.376761,13052.395779,18837.758500,18823.702598,18846.466667,18833.007214,18831.998268,18837.673728,...,18841.008177,18829.711536,18822.732694,18839.422300,18829.309966,0.000000,18836.475512,18833.484283,18838.087704,13191.299307
563078430,2.844349,10191.052827,15.033875,6581.501053,16.670786,22.476513,10.024564,15.139863,13.450595,5.229152,...,7.460558,19.461535,29.981966,4.970644,14.867335,18836.475512,0.000000,14.502020,4.641060,6447.079919
563014650,14.872405,10202.607208,1.139475,6594.359806,31.119013,10.665793,18.594285,0.724252,2.123819,10.005978,...,11.150890,5.528324,16.418830,19.125677,4.340578,18833.484283,14.502020,0.000000,10.848293,6459.982827
564654000,6.660880,10195.674899,11.598543,6586.128216,20.962252,20.147778,9.086990,11.546726,10.306185,0.845067,...,3.319532,16.204428,27.035661,8.452928,12.325529,18838.087704,4.641060,10.848293,0.000000,6451.704607


In [62]:
#Considering the distance between the vessels greater than one km and less than 5 km will interact
filtered_list=distance_df[(distance_df<5)& (distance_df!=0.0)&(distance_df>1)]
filtered_list
filtered_list.shape

(1000, 1000)

In [63]:
#Function to get the list of non null values as the above filtered list will have null values
def Non_Null_Col(row):
    return row.dropna().index.tolist()

In [94]:
#Series contains all the vessel mmsi which satisfy above category
l=filtered_list.apply(Non_Null_Col,axis=1)

In [107]:
pf=pd.DataFrame(l)
pf

Unnamed: 0_level_0,0
mmsi,Unnamed: 1_level_1
565761000,"[564654000, 563014650, 563078430, 564780000, 5..."
538008084,[538008084]
564654000,"[563078430, 564654000, 564780000, 564654000, 5..."
529123000,[529123000]
564780000,[]
...,...
538008064,"[352656000, 352656000, 352656000, 352656000, 3..."
563078430,"[565761000, 565761000, 564654000, 564654000, 5..."
563014650,"[564654000, 563078430, 564654000, 564780000, 5..."
564654000,"[564654000, 564654000, 564654000, 564654000, 5..."


In [108]:
pf.reset_index(inplace=True)
pf

Unnamed: 0,mmsi,0
0,565761000,"[564654000, 563014650, 563078430, 564780000, 5..."
1,538008084,[538008084]
2,564654000,"[563078430, 564654000, 564780000, 564654000, 5..."
3,529123000,[529123000]
4,564780000,[]
...,...,...
995,538008064,"[352656000, 352656000, 352656000, 352656000, 3..."
996,563078430,"[565761000, 565761000, 564654000, 564654000, 5..."
997,563014650,"[564654000, 563078430, 564654000, 564780000, 5..."
998,564654000,"[564654000, 564654000, 564654000, 564654000, 5..."


In [109]:
pf['Timestamp']=df['timestamp']

In [110]:
pf

Unnamed: 0,mmsi,0,Timestamp
0,565761000,"[564654000, 563014650, 563078430, 564780000, 5...",2023-03-15 00:27:44+00
1,538008084,[538008084],2023-03-19 23:30:00+00
2,564654000,"[563078430, 564654000, 564780000, 564654000, 5...",2023-03-12 08:22:53+00
3,529123000,[529123000],2023-03-05 16:47:42+00
4,564780000,[],2023-03-11 06:35:20+00
...,...,...,...
995,538008064,"[352656000, 352656000, 352656000, 352656000, 3...",2023-03-18 13:30:00+00
996,563078430,"[565761000, 565761000, 564654000, 564654000, 5...",2023-03-06 06:57:31+00
997,563014650,"[564654000, 563078430, 564654000, 564780000, 5...",2023-03-21 19:31:25+00
998,564654000,"[564654000, 564654000, 564654000, 564654000, 5...",2023-03-19 21:37:08+00


In [None]:
   #Above is the dataframe that is the output 