In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

## 1. Data set reference link: https://www.consumerfinance.gov/data-research/consumer-complaints/#download-the-data
## File data source: https://files.consumerfinance.gov/ccdb/complaints.csv.zip 
## Problem statement:
## Download the data from the file data source and provide possible data insights.

## The dataset contains over 5 million rows, which cannot be handled easily, so we take the first 2 million rows for our insights.

In [43]:
# Load only the first 2,000,000 rows of the complaints export.
# 'ZIP code' is read explicitly as text: the column holds codes with leading
# zeros (e.g. "04005") and masked codes (e.g. "175XX"), so letting pandas infer
# the type could leave a mix of integers and strings in the same column.
df = pd.read_csv("complaints.csv", nrows=2000000, dtype={"ZIP code": str})

In [4]:
# Report the frame's dimensions as (rows, columns).
df.shape

(2000000, 18)

In [16]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Company,State,ZIP code,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Complaint ID
0,2024-01-23,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",ME,04005,Consent not provided,Web,2024-01-23,Closed with non-monetary relief,Yes,8206605
1,2024-01-24,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",FL,33311,Other,Web,2024-01-24,Closed with non-monetary relief,Yes,8211390
2,2024-01-24,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Credit inquiries on your report that you don't...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",PA,175XX,Other,Web,2024-01-24,Closed with non-monetary relief,Yes,8211362
3,2024-01-23,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",TX,79907,Consent provided,Web,2024-01-23,Closed with non-monetary relief,Yes,8210433
4,2024-01-23,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",NY,10075,Consent provided,Web,2024-01-23,Closed with non-monetary relief,Yes,8209430


In [5]:
# Count the missing values in every column.
df.isnull().sum()

Date received                         0
Product                               0
Sub-product                       11477
Issue                                 2
Sub-issue                        108389
Consumer complaint narrative    1291239
Company public response          984689
Company                               0
State                             11673
ZIP code                           6133
Tags                            1850937
Consumer consent provided?       358065
Submitted via                         0
Date sent to company                  0
Company response to consumer         11
Timely response?                      0
Consumer disputed?              1959332
Complaint ID                          0
dtype: int64

In [6]:
# Express the missing values of every column as a percentage of all rows.
df.isnull().sum() / len(df) * 100

Date received                    0.00000
Product                          0.00000
Sub-product                      0.57385
Issue                            0.00010
Sub-issue                        5.41945
Consumer complaint narrative    64.56195
Company public response         49.23445
Company                          0.00000
State                            0.58365
ZIP code                         0.30665
Tags                            92.54685
Consumer consent provided?      17.90325
Submitted via                    0.00000
Date sent to company             0.00000
Company response to consumer     0.00055
Timely response?                 0.00000
Consumer disputed?              97.96660
Complaint ID                     0.00000
dtype: float64

In [44]:
# Remove every column in which at least 45% of the values are missing.
# The percentages are computed once for the whole frame, then all offending
# columns are dropped in a single call.
missing_pct = df.isnull().sum() / len(df) * 100
sparse_columns = list(missing_pct[missing_pct >= 45].index)
df.drop(columns=sparse_columns, inplace=True)

In [11]:
# Check the dimensions again now that the sparse columns have been removed.
df.shape

(2000000, 14)

In [12]:
# Percentage of missing values per remaining column.
df.isnull().sum() / len(df) * 100

Date received                    0.00000
Product                          0.00000
Sub-product                      0.57385
Issue                            0.00010
Sub-issue                        5.41945
Company                          0.00000
State                            0.58365
ZIP code                         0.30665
Consumer consent provided?      17.90325
Submitted via                    0.00000
Date sent to company             0.00000
Company response to consumer     0.00055
Timely response?                 0.00000
Complaint ID                     0.00000
dtype: float64

In [35]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

Date received                   object
Product                         object
Sub-product                     object
Issue                           object
Sub-issue                       object
Company                         object
State                           object
ZIP code                        object
Consumer consent provided?      object
Submitted via                   object
Date sent to company            object
Company response to consumer    object
Timely response?                object
Complaint ID                     int64
dtype: object

In [45]:
# Cast the two date columns to datetime and keep 'Complaint ID' as int64.
# 'ZIP code' is part of the selection but has no entry in the mapping, so it
# is written back with its dtype unchanged.
converted_columns = ['Date received', 'ZIP code', 'Date sent to company', 'Complaint ID']
dtype_overrides = {
    'Date received': 'datetime64[ns]',
    'Date sent to company': 'datetime64[ns]',
    'Complaint ID': 'int64',
}
df[converted_columns] = df[converted_columns].astype(dtype_overrides)

In [46]:
# Verify the dtypes after the conversion.
df.dtypes

Date received                   datetime64[ns]
Product                                 object
Sub-product                             object
Issue                                   object
Sub-issue                               object
Company                                 object
State                                   object
ZIP code                                object
Consumer consent provided?              object
Submitted via                           object
Date sent to company            datetime64[ns]
Company response to consumer            object
Timely response?                        object
Complaint ID                             int64
dtype: object

In [47]:
# Fill missing values one column at a time. 'Product' is the grouping key and
# is never imputed itself; datetime columns are left untouched.
#   * numeric columns: the overall mean when at most 1% of the values are
#     missing, otherwise the mean of the rows sharing the same product;
#   * text columns: the most frequent value among rows sharing the same product.
# Whenever a product has no observed value to learn from, the column-wide
# statistic is used instead so that no gap is left behind.
def _most_frequent(values):
    """Return the most frequent non-null value of ``values`` (NaN if there is none)."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


for col in df.columns:
    if col == 'Product':
        continue

    missing_pct = df[col].isnull().sum() / len(df) * 100
    if not missing_pct > 0:
        # Nothing to fill (this also covers an empty frame, where the ratio is NaN).
        continue

    # Use the pandas dtype helpers rather than comparing against the strings
    # "int"/"float", whose meaning depends on the platform's default widths.
    if pd.api.types.is_numeric_dtype(df[col]):
        overall_mean = df[col].mean()
        if missing_pct <= 1:
            fill_values = overall_mean
        else:
            fill_values = df.groupby('Product')[col].transform('mean').fillna(overall_mean)
    elif pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        # Compute each statistic once instead of once per product.
        overall_mode = _most_frequent(df[col])
        product_modes = df.groupby('Product')[col].agg(_most_frequent)
        fill_values = df['Product'].map(product_modes).fillna(overall_mode)
    else:
        continue

    # Assign the result back instead of calling fillna(inplace=True) on df[col]:
    # the chained in-place form acts on a temporary object under copy-on-write.
    df[col] = df[col].fillna(fill_values)

In [48]:
# Confirm that no missing values remain after imputation.
df.isnull().sum() / len(df) * 100

Date received                   0.0
Product                         0.0
Sub-product                     0.0
Issue                           0.0
Sub-issue                       0.0
Company                         0.0
State                           0.0
ZIP code                        0.0
Consumer consent provided?      0.0
Submitted via                   0.0
Date sent to company            0.0
Company response to consumer    0.0
Timely response?                0.0
Complaint ID                    0.0
dtype: float64

In [49]:
# Print the number of distinct values held by each column.
for column_name, distinct_count in df.nunique().items():
    print(f"{column_name} : {distinct_count}")

Date received : 4478
Product : 20
Sub-product : 85
Issue : 175
Sub-issue : 270
Company : 5175
State : 62
ZIP code : 23716
Consumer consent provided? : 4
Submitted via : 7
Date sent to company : 4361
Company response to consumer : 8
Timely response? : 2
Complaint ID : 2000000


In [50]:
# 'Complaint ID' has one distinct value per row, so it is removed from the frame.
del df['Complaint ID']

## Date Received

In [None]:
# Display the frame as it stands after the cleaning steps.
# NOTE(review): this section has no analysis yet; consider df.head() to keep the output small.
df

## 2. Given an unsorted array of integers, find the length of the longest continuous increasing subsequence (subarray).

## Example 1:
## Input: [1,3,5,4,7]
## Output: 3 
## Example 2:
## Input: [2,2,2,2,2]
## Output: 1

In [8]:
def longSub(arr):
    """Return the length of the longest strictly increasing run of adjacent items.

    An empty (or otherwise falsy) input yields 0. Equal neighbours end a run,
    so a sequence of identical values yields 1.
    """
    if not arr:
        return 0

    longest = streak = 1
    # Walk over neighbouring pairs; a rise extends the streak, anything else restarts it.
    for previous, current in zip(arr, arr[1:]):
        streak = streak + 1 if current > previous else 1
        if streak > longest:
            longest = streak
    return longest

print(longSub([1, 3, 5, 4, 7]))
print(longSub([2, 2, 2, 2]))

3
1


## 3. Given a list of non-negative integers, arrange them such that they form the largest number.

## Example 1:
## Input: [10,2]
## Output: "210"

## Example 2:
## Input: [3,30,34,5,9]
## Output: "9534330"

In [29]:
from itertools import permutations
def largestNumber(arr):
    """Arrange non-negative integers so that their concatenation is as large as possible.

    Args:
        arr: Iterable of non-negative integers.

    Returns:
        The largest concatenation as a string. An empty input gives ``""`` and
        an input made only of zeros gives ``"0"`` rather than a run of zeros.
    """
    # Imported locally so the function is self-contained.
    from functools import cmp_to_key

    pieces = [str(number) for number in arr]

    def _concat_order(left, right):
        # ``left`` belongs first when left+right beats right+left. Both
        # candidates have the same length, so comparing them as text gives
        # the same answer as comparing them as numbers.
        if left + right > right + left:
            return -1
        if left + right < right + left:
            return 1
        return 0

    # Sorting with a pairwise comparator replaces enumerating all n! orderings.
    pieces.sort(key=cmp_to_key(_concat_order))
    largest = ''.join(pieces)

    # A leading zero after sorting means every piece was zero.
    if largest.startswith('0'):
        return '0'
    return largest

print(largestNumber([10, 2]))
print(largestNumber([3, 30, 34, 5, 9]))

210
9534330


## 4. Store all the "servlet-name", and "servlet-class" to a csv file from the attached sample_json.json file using Python.

In [52]:
# Parse the sample JSON file into a DataFrame.
# NOTE(review): the variable name `json` would shadow the standard-library
# module of the same name if that module were imported in this notebook; a
# later cell reads this variable, so any rename has to change both places.
json = pd.read_json('DT A1 sample_json (1) (1).json')

In [34]:
# Display the parsed JSON frame.
json

Unnamed: 0,web-app
servlet,"[{'servlet-name': 'cofaxCDS', 'servlet-class':..."
servlet-mapping,"{'cofaxCDS': '/', 'cofaxEmail': '/cofaxutil/ae..."
taglib,"{'taglib-uri': 'cofax.tld', 'taglib-location':..."


In [56]:
# Pull the two requested fields out of every servlet entry and tabulate them.
wanted_keys = ['servlet-name', 'servlet-class']
servlet_entries = json['web-app']['servlet']
servlet_data = [tuple(entry[key] for key in wanted_keys) for entry in servlet_entries]
servlet_df = pd.DataFrame(servlet_data, columns=wanted_keys)

In [57]:
# Display the extracted servlet names and classes.
servlet_df

Unnamed: 0,servlet-name,servlet-class
0,cofaxCDS,org.cofax.cds.CDSServlet
1,cofaxEmail,org.cofax.cds.EmailServlet
2,cofaxAdmin,org.cofax.cds.AdminServlet
3,fileServlet,org.cofax.cds.FileServlet
4,cofaxTools,org.cofax.cms.CofaxToolsServlet


In [55]:
# Write only the two requested fields; index=False keeps the row index from
# being saved as an extra unnamed first column.
servlet_df.to_csv("servlet.csv", index=False)