In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import zscore
import plotly.graph_objects as go

In [2]:
# Load the rainfall CSV into the working frame. The path is relative, so it
# is resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='mumbai-monthly-rains.csv')

In [3]:
# Print the frame's schema summary: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    121 non-null    int64  
 1   Jan     121 non-null    float64
 2   Feb     121 non-null    float64
 3   Mar     121 non-null    float64
 4   April   121 non-null    float64
 5   May     121 non-null    float64
 6   June    121 non-null    float64
 7   July    121 non-null    float64
 8   Aug     121 non-null    float64
 9   Sept    121 non-null    float64
 10  Oct     121 non-null    float64
 11  Nov     121 non-null    float64
 12  Dec     121 non-null    float64
 13  Total   121 non-null    float64
dtypes: float64(13), int64(1)
memory usage: 13.4 KB


In [4]:
# Count missing values column by column (`isnull` is an alias of `isna`).
df.isna().sum(axis=0)

Unnamed: 0,0
Year,0
Jan,0
Feb,0
Mar,0
April,0
May,0
June,0
July,0
Aug,0
Sept,0


In [6]:
# Drop fully duplicated rows, then coerce `Total` to a numeric dtype.
# Written as one chained expression that rebinds `df` rather than mutating
# the frame in place.
df = (
    df.drop_duplicates()
      .assign(Total=lambda frame: pd.to_numeric(frame['Total']))
)

In [8]:
# Single-feature design matrix (a DataFrame, because of the list selector)
# and the target Series.
x = df.loc[:, ['Year']]
y = df.loc[:, 'Total']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(x_train, y_train)

# Predictions for the held-out rows; left as the last expression so the
# notebook displays the array.
predict = model.predict(x_test)
predict

array([2112.46487111, 2121.85296895, 1987.2902333 , 2146.88789651,
       2056.13628409, 2175.05219002, 2203.21648352, 2006.06642897,
       2099.94740733, 2312.74429161, 2031.10135653, 2168.79345813,
       2009.19579492, 2087.42994355, 2256.41570459, 2344.03795106,
       2319.0030235 , 1974.77276952, 2253.28633865, 2300.22682783,
       2178.18155596, 2115.59423706, 2071.78311382, 2193.82838569,
       2106.20613922])

In [9]:
# Coefficient of determination (R^2) of the predictions on the held-out rows.
# NOTE: despite the variable name, this is an R^2 value, not a
# classification accuracy.
accuracy_score = r2_score(y_true=y_test, y_pred=predict)
accuracy_score

0.23525577280118548

In [10]:
# Standardise `Total` into z-scores and store them as a new column.
# `zscore` is already imported in the notebook's first cell, so it is not
# re-imported here.
df['z_score_total'] = zscore(df['Total'])

In [11]:
# Flag rows whose z-score magnitude exceeds the cutoff, in either direction.
# (A bare `df['z_score_total']` expression used to open this cell; it was not
# the cell's last line, so it displayed nothing and has been removed.)
threshold = 2.5  # cutoff on |z|, compared with a strict `>`
df['anomaly'] = df['z_score_total'].abs() > threshold

In [7]:
# Line chart of the `Total` column against `Year`. `px` is already imported
# in the notebook's first cell, so it is not re-imported here.
fig = px.line(
    df,
    x='Year',
    y='Total',
    markers=True,  # draw a marker at every data point
    title='Year vs Total',
    labels={'Year': 'Year', 'Total': 'Total'},  # axis labels
)

# Dashed blue line with small circular markers.
fig.update_traces(
    line=dict(color='blue', dash='dash'),
    marker=dict(symbol='circle', size=6),
)

fig.show()

In [12]:
# Overlay the flagged anomalies on the Year/Total series. `go` is already
# imported in the notebook's first cell, so it is not re-imported here.

# Filter once and reuse the result for both coordinates of the marker trace
# (the boolean mask was previously evaluated twice).
anomalies = df.loc[df['anomaly']]

fig = go.Figure()

# Original data line
fig.add_trace(go.Scatter(
    x=df['Year'],
    y=df['Total'],
    mode='lines',
    name='Original Data',
))

# Anomalies as red dots
fig.add_trace(go.Scatter(
    x=anomalies['Year'],
    y=anomalies['Total'],
    mode='markers',
    marker=dict(color='red', size=8),
    name='Anomalies',
))

# Layout
fig.update_layout(
    title='Year vs Total with Anomalies',
    xaxis_title='Year',
    yaxis_title='Total',
    width=1000,
    height=500,
)

fig.show()

# Also save a standalone HTML copy of the figure; the relative path is
# resolved against the kernel's current working directory.
fig.write_html('plotly_chart.html')