In [33]:
pip install hmmlearn

Collecting hmmlearn

  Downloading hmmlearn-0.3.2-cp310-cp310-win_amd64.whl (124 kB)

     ---------------------------------------- 0.0/124.5 kB ? eta -:--:--

     ------------ ------------------------ 41.0/124.5 kB 653.6 kB/s eta 0:00:01

     ----------------------------- ------ 102.4/124.5 kB 980.4 kB/s eta 0:00:01

     -------------------------------------- 124.5/124.5 kB 1.0 MB/s eta 0:00:00






Installing collected packages: hmmlearn

Successfully installed hmmlearn-0.3.2

Note: you may need to restart the kernel to use updated packages.


In [40]:
import numpy as np
import pandas as pd 
from hmmlearn import hmm

# Step 1: Data Preprocessing
# Load dataset
data = pd.read_csv("Travel details dataset.csv")

# Clean dataset (remove missing values, outliers, etc.)
data.dropna(inplace=True)

# Encode categorical variables
data = pd.get_dummies(data, columns=["Accommodation", "Gender"])

# Step 2: Feature Engineering
# Select relevant features
selected_features = ["Duration","Age"] 
# selected_features = ["Location","Departure Time", "Travel Duration (hours)","Arrival Time","Purpose","Mean of Trip", "Age","Gender","Occupation","Car Ownership"] # Add more features as needed
X = data[selected_features].values

# Normalize numerical features
# You can use Min-Max scaling, Z-score normalization, or other techniques
# Example of Min-Max scaling
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# Step 3: Training the HMM
# Choose the number of hidden states
n_states = 5

# Initialize and train the HMM
model = hmm.GaussianHMM(n_components=n_states, covariance_type="full", n_iter=100)
model.fit(X)

# Step 4: Generating Sequences
# Generate sequences of human activities
# You can specify the length of the sequence you want to generate
seq_length = 10
generated_seq = model.sample(n_samples=seq_length)

# Step 5: Decoding
# Decode the generated sequence of hidden states
hidden_states = generated_seq[1]
# Map hidden states to human activities based on emission probabilities
# You need to define the mapping based on your trained model
# For simplicity, let's assume a one-to-one mapping
# Replace this with your actual decoding logic
decoded_sequence = hidden_states

# Step 6: Evaluation
# Evaluate the generated sequence (compare with original dataset)
# Compute metrics like accuracy, precision, recall, etc.

# Example of printing the decoded sequence
print("Generated Sequence:")
print(decoded_sequence)





Generated Sequence:

[2 3 2 3 2 3 0 4 1 4]


In [4]:
import numpy as np
import pandas as pd 
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder

# Step 1: Data Preprocessing
# Load dataset
data = pd.read_csv("Travel details dataset.csv")

# Clean dataset (remove missing values, outliers, etc.)
data.dropna(inplace=True)

# Encode categorical variables
label_encoders = {}
for column in ["Location", "Gender"]:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Step 2: Feature Engineering
# Select relevant features
selected_features = ["Duration", "Age"] 
X = data[selected_features].values

# Normalize numerical features
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# Step 3: Training the HMM
n_states = 5
model = hmm.GaussianHMM(n_components=n_states, covariance_type="full", n_iter=100)
model.fit(X)

# Step 4: Generating Sequences
seq_length = 10
generated_seq, _ = model.sample(n_samples=seq_length)

# Step 5: Decoding
decoded_sequence = model.predict(generated_seq)

# Step 6: Decode hidden states to original string values
decoded_sequence_strings = []
for state in decoded_sequence:
    decoded_sequence_strings.append(label_encoders["Location"].inverse_transform([state])[0])

# Print the decoded sequence
print("Decoded Sequence (Original Values):")
print(decoded_sequence_strings)

Decoded Sequence (Original Values):

['Amsterdam', 'Australia', 'Australia', 'Australia', 'Australia', 'Auckland, New Zealand', 'Amsterdam, Netherlands', 'Auckland, New Zealand', 'Amsterdam, Netherlands', 'Athens, Greece']


In [3]:
import numpy as np
import pandas as pd 
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Data Preprocessing
# Load dataset
data = pd.read_csv("Travel details dataset.csv")

# Clean dataset (remove missing values, outliers, etc.)
data.dropna(inplace=True)

# Encode categorical variables
label_encoders = {}
for column in ["Location", "Gender"]:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Step 2: Feature Engineering
# Select relevant features
selected_features = ["Duration", "Age"] 
X = data[selected_features].values

# Normalize numerical features
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# Step 3: Training and Evaluation using Cross-Validation
n_states = 5
accuracy_scores = []
num_folds = 5  # Number of folds for cross-validation

# Perform cross-validation
for _ in range(num_folds):
    # Split data into training and test sets
    X_train, X_test = train_test_split(X, test_size=0.2, random_state=np.random.randint(100))
    
    # Train the HMM model
    model = hmm.GaussianHMM(n_components=n_states, covariance_type="full", n_iter=100)
    model.fit(X_train)
    
    # Generate sequences and decode
    seq_length = len(X_test)  # Use the length of the test set
    generated_seq, _ = model.sample(n_samples=seq_length)
    decoded_sequence = model.predict(generated_seq)
    
    # Decode hidden states to original string values
    decoded_sequence_strings = label_encoders["Location"].inverse_transform(decoded_sequence)
    
    # Compute accuracy
    accuracy = accuracy_score(label_encoders["Location"].inverse_transform(model.predict(X_test)), decoded_sequence_strings)
    accuracy_scores.append(accuracy)

# Calculate average accuracy
average_accuracy = np.mean(accuracy_scores)
print("Average Accuracy:", average_accuracy)


Average Accuracy: 0.3142857142857142


In [9]:
import numpy as np
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Initialize lists to store valid data
trip_durations = []
user_types = []
genders = []
birth_years = []

# Load data from CSV file
with open('trip.csv', 'r') as file:
    for line_number, line in enumerate(file, start=1):
        # Skip header
        if line_number == 1:
            continue
        # Attempt to parse the line
        try:
            fields = line.strip().split(',')
            if len(fields) != 12:  # Check if the number of fields matches the expected
                raise ValueError(f"Expected 12 fields, found {len(fields)} fields")
            trip_durations.append(float(fields[4]))  # Assuming tripduration is the 5th field
            user_types.append(fields[9])  # Assuming usertype is the 10th field
            genders.append(fields[10])  # Assuming gender is the 11th field
            birth_years.append(fields[11])  # Assuming birthyear is the 12th field
        except Exception as e:
            print(f"Error processing line {line_number}: {e}")

# Encode states (e.g., user types, genders, etc.)
label_encoder = LabelEncoder()
encoded_user_types = label_encoder.fit_transform(user_types)
encoded_genders = label_encoder.fit_transform(genders)
encoded_birth_years = label_encoder.fit_transform(birth_years)

# Combine features
X = np.column_stack([trip_durations, encoded_user_types, encoded_genders, encoded_birth_years])

# Create and fit HMM
model = hmm.GaussianHMM(n_components=2, covariance_type="full", n_iter=100)
model.fit(X)

# Predict states
hidden_states = model.predict(X)

# Calculate accuracy if ground truth is available
# accuracy = np.mean(hidden_states == ground_truth)
# print("Accuracy:", accuracy)

# Generate a sequence
sequence = hidden_states
print("Generated sequence:", sequence)


Error processing line 50794: Expected 12 fields, found 20 fields


Model is not converging.  Current: 131873.69181339158 is not greater than 131873.6918134356. Delta is -4.4034095481038094e-08


Generated sequence: [0 0 0 ... 1 1 1]
