# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset. The path is relative, so the CSV is looked up in the
# current working directory. Only the 'Year' and 'Rape' columns are used below.
file_path = 'CrimesOnWomenData.csv'
data = pd.read_csv(file_path)

# Preprocess the data.
# Coerce the two modelled columns to numbers first; values that cannot be
# parsed become NaN so that a single targeted dropna can remove them.
data['Year'] = pd.to_numeric(data['Year'], errors='coerce')
data['Rape'] = pd.to_numeric(data['Rape'], errors='coerce')

# Require only 'Year' and 'Rape' to be present. A blanket dropna() would also
# throw away rows whose only gap is in a column the model never reads.
data = data.dropna(subset=['Rape', 'Year'])

# Model inputs: 'Year' is the only feature (selected with a list so it stays
# a 2-D frame), and the 'Rape' column is the target series.
y = data['Rape']
X = data[['Year']]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion (fit() returns the
# estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and report the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Build the prediction grid: every year from the first observed one through
# `forecast_horizon` years past the last observed one.
forecast_horizon = 10
first_year = X['Year'].min()
last_year = X['Year'].max()
# np.arange excludes its stop value, so add 1 to make the final forecast year
# (last_year + forecast_horizon) part of the grid.
future_years = pd.DataFrame(
    {'Year': np.arange(first_year, last_year + forecast_horizon + 1)}
)
# Predict from a DataFrame with a 'Year' column, matching the column the
# model was fitted on.
future_predictions = model.predict(future_years)

# Plot the actual data and the predictions
plt.figure(figsize=(14, 7))
plt.scatter(data['Year'], data['Rape'], color='blue', label='Actual Rape Incidents')
# Pass the 1-D 'Year' column as x rather than the whole DataFrame.
plt.plot(
    future_years['Year'],
    future_predictions,
    color='red',
    linestyle='--',
    label='Predicted Rape Incidents',
)
plt.xlabel('Year')
plt.ylabel('Number of Rape Incidents')
plt.title('Rape Incidents vs Year')
plt.legend()
plt.grid(True)
plt.show()