Handling NSS request after reconnect and memory hierarchy #1696

sssd-bot · 2020-05-02T12:13:28Z

Cloned from Pagure issue: https://pagure.io/SSSD/sssd/issue/654

Created at 2010-10-21 13:39:51 by sbose
Closed as Fixed
Assigned to sbose

There are two related issues in the NSS responder:
- requests are not handled after the data provider dies and the responder reconnects, they will timeout after some time
- there is a memory hierarchy bug which lets the NSS responder die if the not handled requests timeout

The following patch shall help to reproduce both issues:

diff --git a/src/providers/ldap/ldap_id.c b/src/providers/ldap/ldap_id.c
index 135e370..af69566 100644
--- a/src/providers/ldap/ldap_id.c
+++ b/src/providers/ldap/ldap_id.c
@@ -657,6 +657,8 @@ void sdap_account_info_handler(struct be_req *breq)
     const char *err = "Unknown Error";
     int ret = EOK;
 
+    ar->entry_type = 123;
+
     ctx = talloc_get_type(breq->be_ctx->bet_info[BET_ID].pvt_bet_data, struct sdap_id_ctx);
 
     if (be_is_offline(ctx->be)) {
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976..5cd1e49 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -256,6 +256,8 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
     struct sss_dp_req *sdp_req = NULL;
     struct sss_dp_callback *cb;
 
+    timeout = 10;
+
     /* either, or, not both */
     if (opt_name && opt_id) {
         return EINVAL;

The first hunk makes the LDAP ID provider die on every request. The second hunk reduces the timeout to 10s so you do not have to wait too much.

I would like to propose a fix similar to the next patch

diff --git a/src/responder/common/responder.h b/src/responder/common/responder.h
index 980e561..783f9e4 100644
--- a/src/responder/common/responder.h
+++ b/src/responder/common/responder.h
@@ -161,6 +161,8 @@ struct cli_protocol_version *register_cli_protocol_version(void);
 typedef void (*sss_dp_callback_t)(uint16_t err_maj, uint32_t err_min,
                                   const char *err_msg, void *ptr);
 
+void handle_requests_after_reconnect(void);
+
 int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
                          sss_dp_callback_t callback, void *callback_ctx,
                          int timeout, const char *domain,
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976..0f1f7a8 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -107,6 +107,25 @@ static int sss_dp_req_destructor(void *ptr)
     return 0;
 }
 
+static bool reconnect_handler(hash_entry_t *item, void *user_data)
+{
+    struct sss_dp_req *sdp_req = talloc_get_type(item->value.ptr,
+                                                 struct sss_dp_req);
+
+    return (talloc_free(sdp_req) == EOK ? true : false);
+}
+
+void handle_requests_after_reconnect(void)
+{
+    int ret;
+
+    ret = hash_iterate(dp_requests, reconnect_handler, NULL);
+    if (ret != HASH_SUCCESS) {
+        DEBUG(1, ("hash_iterate failed, "
+                  "not all request might be handled after reconnect.\n"));
+    }
+}
+
 static void sdp_req_timeout(struct tevent_context *ev,
                             struct tevent_timer *te,
                             struct timeval t, void *ptr)
@@ -288,7 +307,7 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
 
     if (dp_requests == NULL) {
         /* Create a hash table to handle queued update requests */
-        ret = hash_create(10, &dp_requests, NULL, NULL);
+        ret = sss_hash_create(NULL, 10, &dp_requests);
         if (ret != HASH_SUCCESS) {
             fprintf(stderr, "cannot create hash table (%s)\n", hash_error_string(ret));
             return EIO;
@@ -336,7 +355,7 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
                 goto done;
             }
 
-            cb = talloc_zero(callback_memctx, struct sss_dp_callback);
+            cb = talloc_zero(sdp_req, struct sss_dp_callback);
             if (!cb) {
                 ret = ENOMEM;
                 goto done;
@@ -482,7 +501,7 @@ static int sss_dp_send_acct_req_create(struct resp_ctx *rctx,
     sdp_req->pending_reply = pending_reply;
 
     if (callback) {
-        cb = talloc_zero(callback_memctx, struct sss_dp_callback);
+        cb = talloc_zero(sdp_req, struct sss_dp_callback);
         if (!cb) {
             talloc_zfree(sdp_req);
             return ENOMEM;
diff --git a/src/responder/nss/nsssrv.c b/src/responder/nss/nsssrv.c
index 4f5aa62..dfb0312 100644
--- a/src/responder/nss/nsssrv.c
+++ b/src/responder/nss/nsssrv.c
@@ -39,6 +39,7 @@
 #include "dbus/dbus.h"
 #include "sbus/sssd_dbus.h"
 #include "responder/common/responder_packet.h"
+#include "responder/common/responder.h"
 #include "providers/data_provider.h"
 #include "monitor/monitor_interfaces.h"
 #include "sbus/sbus_client.h"
@@ -143,7 +144,10 @@ static void nss_dp_reconnect_init(struct sbus_connection *conn,
                                 DATA_PROVIDER_VERSION,
                                 "NSS");
         /* all fine */
-        if (ret == EOK) return;
+        if (ret == EOK) {
+            handle_requests_after_reconnect();
+            return;
+        }
     }
 
     /* Failed to reconnect */

This patch makes the sdp_req the parent of the callbacks (yes, some additional cleanup is needed because callback_memctx is not needed anymore) and introduces a reconnect handler which just terminates the current request.

I think it is better to just kill the running requests instead of resending them, because one of these requests might have caused the termination of the DP and resending it will kill the DP again and again. And the DP has to be fixed wither way.

And last but not least it might be a good time to find a place for dp_requests in the memory hierarchy.

Comments

Comment from sgallagh at 2010-10-21 13:50:12

Segfault issues are blockers. This needs to be fixed for 1.4.1.

milestone: NEEDS_TRIAGE => SSSD 1.4.1
owner: somebody => sbose
priority: major => blocker

Comment from sbose at 2010-10-22 18:26:01

I think I figured out how this double free issue works:
- cb is allocated on cmdctx
- if a timeout occurs sdp_req_timeout() calls talloc_free(sdp_req)
- the sdp destructor sss_dp_req_destructor() calls one callback after the other and talloc_free(cb) after the callback is run
- the callback itself, e.g. nss_cmd_getgrnam_dp_callback() calls sss_cmd_done() which frees cmdctx
- so if the callback returns to sss_dp_req_destructor() cb is already freed, because it is a child of cmdctx and talloc_free(cb) must fail

I'm just wondering why we do not see this more often

description: There are two related issues in the NSS responder:

requests are not handled after the data provider dies and the responder reconnects, they will timeout after some time
there is a memory hierarchy bug which lets the NSS responder die if the not handled requests timeout

The following patch shall help to reproduce both issues:

{{{
diff --git a/src/providers/ldap/ldap_id.c b/src/providers/ldap/ldap_id.c
index 135e370..af69566 100644
--- a/src/providers/ldap/ldap_id.c
+++ b/src/providers/ldap/ldap_id.c
@@ -657,6 +657,8 @@ void sdap_account_info_handler(struct be_req *breq)
const char *err = "Unknown Error";
int ret = EOK;

ar->entry_type = 123;
ctx = talloc_get_type(breq->be_ctx->bet_info[BET_ID].pvt_bet_data, struct sdap_id_ctx);

if (be_is_offline(ctx->be)) {
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976..5cd1e49 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -256,6 +256,8 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
struct sss_dp_req *sdp_req = NULL;
struct sss_dp_callback *cb;
timeout = 10;
/* either, or, not both */
if (opt_name && opt_id) {
return EINVAL;
}}}

The first hunk makes the LDAP ID provider die on every request. The second hunk reduces the timeout to 10s so you do not have to wait too much.

I would like to propose a fix similar to the next patch

{{{
diff --git a/src/responder/common/responder.h b/src/responder/common/responder.h
index 980e561..783f9e4 100644
--- a/src/responder/common/responder.h
+++ b/src/responder/common/responder.h
@@ -161,6 +161,8 @@ struct cli_protocol_version *register_cli_protocol_version(void);
typedef void (*sss_dp_callback_t)(uint16_t err_maj, uint32_t err_min,
const char *err_msg, void *ptr);

+void handle_requests_after_reconnect(void);
+
int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
sss_dp_callback_t callback, void *callback_ctx,
int timeout, const char *domain,
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976..0f1f7a8 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -107,6 +107,25 @@ static int sss_dp_req_destructor(void *ptr)
return 0;
}

+static bool reconnect_handler(hash_entry_t *item, void *user_data)
+{

struct sss_dp_req *sdp_req = talloc_get_type(item->value.ptr,

                                            struct sss_dp_req);

return (talloc_free(sdp_req) == EOK ? true : false);
+}

+void handle_requests_after_reconnect(void)
+{

int ret;
ret = hash_iterate(dp_requests, reconnect_handler, NULL);
if (ret != HASH_SUCCESS) {
```
   DEBUG(1, ("hash_iterate failed, "
```

             "not all request might be handled after reconnect.\n"));

}
+}

static void sdp_req_timeout(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *ptr)
@@ -288,7 +307,7 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,

 if (dp_requests == NULL) {
     /* Create a hash table to handle queued update requests */

   ret = hash_create(10, &dp_requests, NULL, NULL);

   ret = sss_hash_create(NULL, 10, &dp_requests);
   if (ret != HASH_SUCCESS) {
       fprintf(stderr, "cannot create hash table (%s)\n", hash_error_string(ret));
       return EIO;

@@ -336,7 +355,7 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
goto done;
}

       cb = talloc_zero(callback_memctx, struct sss_dp_callback);

       cb = talloc_zero(sdp_req, struct sss_dp_callback);
       if (!cb) {
           ret = ENOMEM;
           goto done;

@@ -482,7 +501,7 @@ static int sss_dp_send_acct_req_create(struct resp_ctx *rctx,
sdp_req->pending_reply = pending_reply;

 if (callback) {

   cb = talloc_zero(callback_memctx, struct sss_dp_callback);

   cb = talloc_zero(sdp_req, struct sss_dp_callback);
   if (!cb) {
       talloc_zfree(sdp_req);
       return ENOMEM;

diff --git a/src/responder/nss/nsssrv.c b/src/responder/nss/nsssrv.c
index 4f5aa62..dfb0312 100644
--- a/src/responder/nss/nsssrv.c
+++ b/src/responder/nss/nsssrv.c
@@ -39,6 +39,7 @@
#include "dbus/dbus.h"
#include "sbus/sssd_dbus.h"
#include "responder/common/responder_packet.h"
+#include "responder/common/responder.h"
#include "providers/data_provider.h"
#include "monitor/monitor_interfaces.h"
#include "sbus/sbus_client.h"
@@ -143,7 +144,10 @@ static void nss_dp_reconnect_init(struct sbus_connection conn,
DATA_PROVIDER_VERSION,
"NSS");
/ all fine */

```
   if (ret == EOK) return;
```

```
   if (ret == EOK) {
```

       handle_requests_after_reconnect();

```
       return;
```
```
   }
```
}

/* Failed to reconnect */
}}}

This patch makes the sdp_req the parent of the callbacks (yes, some additional cleanup is needed because callback_memctx is not needed anymore) and introduces a reconnect handler which just terminates the current request.

I think it is better to just kill the running requests instead of resending them, because one of these requests might have caused the termination of the DP and resending it will kill the DP again and again. And the DP has to be fixed wither way.

And last but not least it might be a good time to find a place for dp_requests in the memory hierarchy. => There are two related issues in the NSS responder:

requests are not handled after the data provider dies and the responder reconnects, they will timeout after some time
there is a memory hierarchy bug which lets the NSS responder die if the not handled requests timeout

The following patch shall help to reproduce both issues:

{{{
diff --git a/src/providers/ldap/ldap_id.c b/src/providers/ldap/ldap_id.c
index 135e370..af69566 100644
--- a/src/providers/ldap/ldap_id.c
+++ b/src/providers/ldap/ldap_id.c
@@ -657,6 +657,8 @@ void sdap_account_info_handler(struct be_req *breq)
const char *err = "Unknown Error";
int ret = EOK;

ar->entry_type = 123;
ctx = talloc_get_type(breq->be_ctx->bet_info[BET_ID].pvt_bet_data, struct sdap_id_ctx);

if (be_is_offline(ctx->be)) {
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976..5cd1e49 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -256,6 +256,8 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
struct sss_dp_req *sdp_req = NULL;
struct sss_dp_callback *cb;
timeout = 10;
/* either, or, not both */
if (opt_name && opt_id) {
return EINVAL;
}}}

The first hunk makes the LDAP ID provider die on every request. The second hunk reduces the timeout to 10s so you do not have to wait too much.

I would like to propose a fix similar to the next patch

{{{
diff --git a/src/responder/common/responder.h b/src/responder/common/responder.h
index 980e561..783f9e4 100644
--- a/src/responder/common/responder.h
+++ b/src/responder/common/responder.h
@@ -161,6 +161,8 @@ struct cli_protocol_version *register_cli_protocol_version(void);
typedef void (*sss_dp_callback_t)(uint16_t err_maj, uint32_t err_min,
const char *err_msg, void *ptr);

+void handle_requests_after_reconnect(void);
+
int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
sss_dp_callback_t callback, void *callback_ctx,
int timeout, const char *domain,
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976..0f1f7a8 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -107,6 +107,25 @@ static int sss_dp_req_destructor(void *ptr)
return 0;
}

+static bool reconnect_handler(hash_entry_t *item, void *user_data)
+{

struct sss_dp_req *sdp_req = talloc_get_type(item->value.ptr,

                                            struct sss_dp_req);

return (talloc_free(sdp_req) == EOK ? true : false);
+}

+void handle_requests_after_reconnect(void)
+{

int ret;
ret = hash_iterate(dp_requests, reconnect_handler, NULL);
if (ret != HASH_SUCCESS) {
```
   DEBUG(1, ("hash_iterate failed, "
```

             "not all request might be handled after reconnect.\n"));

}
+}

static void sdp_req_timeout(struct tevent_context *ev,
struct tevent_timer *te,
struct timeval t, void *ptr)
@@ -288,7 +307,7 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,

 if (dp_requests == NULL) {
     /* Create a hash table to handle queued update requests */

   ret = hash_create(10, &dp_requests, NULL, NULL);

   ret = sss_hash_create(NULL, 10, &dp_requests);
   if (ret != HASH_SUCCESS) {
       fprintf(stderr, "cannot create hash table (%s)\n", hash_error_string(ret));
       return EIO;

@@ -336,7 +355,7 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
goto done;
}

       cb = talloc_zero(callback_memctx, struct sss_dp_callback);

       cb = talloc_zero(sdp_req, struct sss_dp_callback);
       if (!cb) {
           ret = ENOMEM;
           goto done;

@@ -482,7 +501,7 @@ static int sss_dp_send_acct_req_create(struct resp_ctx *rctx,
sdp_req->pending_reply = pending_reply;

 if (callback) {

   cb = talloc_zero(callback_memctx, struct sss_dp_callback);

   cb = talloc_zero(sdp_req, struct sss_dp_callback);
   if (!cb) {
       talloc_zfree(sdp_req);
       return ENOMEM;

diff --git a/src/responder/nss/nsssrv.c b/src/responder/nss/nsssrv.c
index 4f5aa62..dfb0312 100644
--- a/src/responder/nss/nsssrv.c
+++ b/src/responder/nss/nsssrv.c
@@ -39,6 +39,7 @@
#include "dbus/dbus.h"
#include "sbus/sssd_dbus.h"
#include "responder/common/responder_packet.h"
+#include "responder/common/responder.h"
#include "providers/data_provider.h"
#include "monitor/monitor_interfaces.h"
#include "sbus/sbus_client.h"
@@ -143,7 +144,10 @@ static void nss_dp_reconnect_init(struct sbus_connection conn,
DATA_PROVIDER_VERSION,
"NSS");
/ all fine */

```
   if (ret == EOK) return;
```

```
   if (ret == EOK) {
```

       handle_requests_after_reconnect();

```
       return;
```
```
   }
```
}

/* Failed to reconnect */
}}}

This patch makes the sdp_req the parent of the callbacks (yes, some additional cleanup is needed because callback_memctx is not needed anymore) and introduces a reconnect handler which just terminates the current request.

I think it is better to just kill the running requests instead of resending them, because one of these requests might have caused the termination of the DP and resending it will kill the DP again and again. And the DP has to be fixed wither way.

And last but not least it might be a good time to find a place for dp_requests in the memory hierarchy.

Comment from sgallagh at 2010-10-26 14:04:02

Fixed by:
- 98e3c54
- c475a1c
- 09aa3d9

fixedin: => 1.4.1
resolution: => fixed
status: new => closed
tests: 0 => 1

Comment from jgalipea at 2011-05-26 20:21:03

Fields changed

coverity: =>
patch: => 0
tests: 1 => 0
upgrade: => 0

Comment from dpal at 2012-01-19 02:55:43

Fields changed

rhbz: => 0

Comment from sbose at 2017-02-24 15:08:35

Metadata Update from @sbose: